;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT |vp8_subtract_b_neon|
    EXPORT |vp8_subtract_mby_neon|
    EXPORT |vp8_subtract_mbuv_neon|

    INCLUDE asm_enc_offsets.asm
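; asm_enc_offsets.asm supplies the vp8_block_* / vp8_blockd_* structure-member
; offsets used below; it is generated from the C struct layouts at build time.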

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
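;
; Computes the 4x4 residual for one block: source pixels minus the predictor,
; widened to 16 bits. A rough C equivalent (a sketch for reference, with field
; names taken from the BLOCK/BLOCKD offsets used below):
;
;   unsigned char *src  = *be->base_src + be->src;
;   unsigned char *pred = bd->predictor;
;   short         *diff = be->src_diff;
;   for (int r = 0; r < 4; r++) {
;       for (int c = 0; c < 4; c++)
;           diff[c] = src[c] - pred[c];
;       diff += pitch;                  /* pitch shorts == 2*pitch bytes */
;       src  += be->src_stride;
;       pred += pitch;
;   }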
|vp8_subtract_b_neon| PROC

    stmfd   sp!, {r4-r7}

    ldr     r3, [r0, #vp8_block_base_src]   ; r3 = be->base_src
    ldr     r4, [r0, #vp8_block_src]        ; r4 = be->src (byte offset)
    ldr     r5, [r0, #vp8_block_src_diff]   ; r5 = be->src_diff
    ldr     r3, [r3]                        ; r3 = *base_src
    ldr     r6, [r0, #vp8_block_src_stride] ; r6 = be->src_stride
    add     r3, r3, r4                      ; src = *base_src + src
    ldr     r7, [r1, #vp8_blockd_predictor] ; r7 = bd->predictor

    vld1.8          {d0}, [r3], r6          ; load 4 rows of src (8 bytes each)
    vld1.8          {d1}, [r7], r2          ; load 4 rows of pred
    vld1.8          {d2}, [r3], r6
    vld1.8          {d3}, [r7], r2
    vld1.8          {d4}, [r3], r6
    vld1.8          {d5}, [r7], r2
    vld1.8          {d6}, [r3], r6
    vld1.8          {d7}, [r7], r2

    vsubl.u8        q10, d0, d1             ; widen to 16 bits and subtract
    vsubl.u8        q11, d2, d3
    vsubl.u8        q12, d4, d5
    vsubl.u8        q13, d6, d7

    mov             r2, r2, lsl #1          ; diff is 16 bit, so double pitch for the byte stride

    vst1.16         {d20}, [r5], r2         ; store the low 4 diffs of each row (4x4 block)
    vst1.16         {d22}, [r5], r2
    vst1.16         {d24}, [r5], r2
    vst1.16         {d26}, [r5], r2

    ldmfd   sp!, {r4-r7}
    bx              lr

    ENDP


;==========================================
;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
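;
; Subtracts the 16x16 luma prediction from the source macroblock. The pred
; and diff buffers hold contiguous 16-wide rows; only the source is strided.
; Roughly equivalent to this C sketch:
;
;   for (int r = 0; r < 16; r++)
;       for (int c = 0; c < 16; c++)
;           diff[r * 16 + c] = src[r * stride + c] - pred[r * 16 + c];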
|vp8_subtract_mby_neon| PROC
    vpush           {d8-d15}                ; q4-q7 are used below; AAPCS requires d8-d15 be preserved
    mov             r12, #4                 ; 16 rows, 4 per iteration

subtract_mby_loop
    vld1.8          {q0}, [r1], r3          ; load 4 rows of src (16 bytes each)
    vld1.8          {q1}, [r2]!             ; load 4 rows of pred (contiguous)
    vld1.8          {q2}, [r1], r3
    vld1.8          {q3}, [r2]!
    vld1.8          {q4}, [r1], r3
    vld1.8          {q5}, [r2]!
    vld1.8          {q6}, [r1], r3
    vld1.8          {q7}, [r2]!

    vsubl.u8        q8, d0, d2
    vsubl.u8        q9, d1, d3
    vsubl.u8        q10, d4, d6
    vsubl.u8        q11, d5, d7
    vsubl.u8        q12, d8, d10
    vsubl.u8        q13, d9, d11
    vsubl.u8        q14, d12, d14
    vsubl.u8        q15, d13, d15

    vst1.16         {q8}, [r0]!             ; store diff
    vst1.16         {q9}, [r0]!
    vst1.16         {q10}, [r0]!
    vst1.16         {q11}, [r0]!
    vst1.16         {q12}, [r0]!
    vst1.16         {q13}, [r0]!
    vst1.16         {q14}, [r0]!
    vst1.16         {q15}, [r0]!

    subs            r12, r12, #1
    bne             subtract_mby_loop

    vpop            {d8-d15}
    bx              lr
    ENDP

;=================================
;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
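;
; Subtracts the 8x8 U and V predictions from the source chroma planes. The
; Y plane occupies the first 256 entries of both diff (shorts) and pred
; (bytes), so chroma starts at offset 256 in each, with V right after U.
; Roughly equivalent to this C sketch:
;
;   short *udiff = diff + 256;              /* +512 bytes: diff is 16 bit */
;   unsigned char *upred = pred + 256;
;   for (int r = 0; r < 8; r++)
;       for (int c = 0; c < 8; c++)
;           udiff[r * 8 + c] = usrc[r * stride + c] - upred[r * 8 + c];
;   /* then the same for vsrc against diff + 320 and pred + 320 */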
|vp8_subtract_mbuv_neon| PROC
    ldr             r12, [sp]               ; stride is the fifth argument, passed on the stack
    vpush           {d8-d15}                ; q4-q7 are used below; AAPCS requires d8-d15 be preserved

;u
    add             r0, r0, #512            ; short *udiff = diff + 256;
    add             r3, r3, #256            ; unsigned char *upred = pred + 256;

    vld1.8          {d0}, [r1], r12         ; load 8 rows of usrc (8 bytes each)
    vld1.8          {d1}, [r3]!             ; load 8 rows of upred (contiguous)
    vld1.8          {d2}, [r1], r12
    vld1.8          {d3}, [r3]!
    vld1.8          {d4}, [r1], r12
    vld1.8          {d5}, [r3]!
    vld1.8          {d6}, [r1], r12
    vld1.8          {d7}, [r3]!
    vld1.8          {d8}, [r1], r12
    vld1.8          {d9}, [r3]!
    vld1.8          {d10}, [r1], r12
    vld1.8          {d11}, [r3]!
    vld1.8          {d12}, [r1], r12
    vld1.8          {d13}, [r3]!
    vld1.8          {d14}, [r1], r12
    vld1.8          {d15}, [r3]!

    vsubl.u8        q8, d0, d1
    vsubl.u8        q9, d2, d3
    vsubl.u8        q10, d4, d5
    vsubl.u8        q11, d6, d7
    vsubl.u8        q12, d8, d9
    vsubl.u8        q13, d10, d11
    vsubl.u8        q14, d12, d13
    vsubl.u8        q15, d14, d15

    vst1.16         {q8}, [r0]!             ; store diff
    vst1.16         {q9}, [r0]!
    vst1.16         {q10}, [r0]!
    vst1.16         {q11}, [r0]!
    vst1.16         {q12}, [r0]!
    vst1.16         {q13}, [r0]!
    vst1.16         {q14}, [r0]!
    vst1.16         {q15}, [r0]!

;v
    vld1.8          {d0}, [r2], r12         ; load 8 rows of vsrc (8 bytes each)
    vld1.8          {d1}, [r3]!             ; load 8 rows of vpred (contiguous)
    vld1.8          {d2}, [r2], r12
    vld1.8          {d3}, [r3]!
    vld1.8          {d4}, [r2], r12
    vld1.8          {d5}, [r3]!
    vld1.8          {d6}, [r2], r12
    vld1.8          {d7}, [r3]!
    vld1.8          {d8}, [r2], r12
    vld1.8          {d9}, [r3]!
    vld1.8          {d10}, [r2], r12
    vld1.8          {d11}, [r3]!
    vld1.8          {d12}, [r2], r12
    vld1.8          {d13}, [r3]!
    vld1.8          {d14}, [r2], r12
    vld1.8          {d15}, [r3]!

    vsubl.u8        q8, d0, d1
    vsubl.u8        q9, d2, d3
    vsubl.u8        q10, d4, d5
    vsubl.u8        q11, d6, d7
    vsubl.u8        q12, d8, d9
    vsubl.u8        q13, d10, d11
    vsubl.u8        q14, d12, d13
    vsubl.u8        q15, d14, d15

    vst1.16         {q8}, [r0]!             ; store diff
    vst1.16         {q9}, [r0]!
    vst1.16         {q10}, [r0]!
    vst1.16         {q11}, [r0]!
    vst1.16         {q12}, [r0]!
    vst1.16         {q13}, [r0]!
    vst1.16         {q14}, [r0]!
    vst1.16         {q15}, [r0]!

    vpop            {d8-d15}
    bx              lr
    ENDP

    END