;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT |vp8_subtract_b_neon|
    EXPORT |vp8_subtract_mby_neon|
    EXPORT |vp8_subtract_mbuv_neon|

    INCLUDE vp8_asm_enc_offsets.asm

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
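;
; What this routine computes, as a rough C sketch (illustrative only; struct
; field names follow the vp8_block_*/vp8_blockd_* offset symbols used below):
;
;   void subtract_b(BLOCK *be, BLOCKD *bd, int pitch)
;   {
;       unsigned char *src  = *be->base_src + be->src;
;       unsigned char *pred = bd->predictor;
;       short         *diff = be->src_diff;
;       int r, c;
;       for (r = 0; r < 4; r++) {
;           for (c = 0; c < 4; c++)
;               diff[c] = src[c] - pred[c];
;           src  += be->src_stride;
;           pred += pitch;              /* pitch in bytes of the prediction */
;           diff += pitch;              /* and in elements of the 16-bit diff */
;       }
;   }
;
; The loads below fetch 8 bytes per row, but only the first 4 results of each
; vsubl are stored.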
|vp8_subtract_b_neon| PROC

    stmfd   sp!, {r4-r7}

    ldr     r3, [r0, #vp8_block_base_src]
    ldr     r4, [r0, #vp8_block_src]
    ldr     r5, [r0, #vp8_block_src_diff]
    ldr     r3, [r3]
    ldr     r6, [r0, #vp8_block_src_stride]
    add     r3, r3, r4                      ; src = *base_src + src
    ldr     r7, [r1, #vp8_blockd_predictor]

    vld1.8          {d0}, [r3], r6          ;load src
    vld1.8          {d1}, [r7], r2          ;load pred
    vld1.8          {d2}, [r3], r6
    vld1.8          {d3}, [r7], r2
    vld1.8          {d4}, [r3], r6
    vld1.8          {d5}, [r7], r2
    vld1.8          {d6}, [r3], r6
    vld1.8          {d7}, [r7], r2

    vsubl.u8        q10, d0, d1
    vsubl.u8        q11, d2, d3
    vsubl.u8        q12, d4, d5
    vsubl.u8        q13, d6, d7

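    ; the diff buffer holds 16-bit values, so double the element pitch into a
    ; byte stride before using it as the store post-increment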
    mov             r2, r2, lsl #1

    vst1.16         {d20}, [r5], r2         ;store diff
    vst1.16         {d22}, [r5], r2
    vst1.16         {d24}, [r5], r2
    vst1.16         {d26}, [r5], r2

    ldmfd   sp!, {r4-r7}
    bx              lr

    ENDP


;==========================================
;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride,
;                           unsigned char *pred, int pred_stride)
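;
; Rough C sketch of the operation (illustrative; the diff buffer is the 16x16
; luma residual laid out 16 shorts per row):
;
;   void subtract_mby(short *diff, unsigned char *src, int src_stride,
;                     unsigned char *pred, int pred_stride)
;   {
;       int r, c;
;       for (r = 0; r < 16; r++) {
;           for (c = 0; c < 16; c++)
;               diff[c] = src[c] - pred[c];
;           diff += 16;
;           src  += src_stride;
;           pred += pred_stride;
;       }
;   }
;
; The loop below handles four rows per iteration, so r12 counts 4 passes.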
|vp8_subtract_mby_neon| PROC
    push            {r4-r7}
    mov             r12, #4
    ldr             r4, [sp, #16]           ; pred_stride
    vpush           {q4-q7}                 ; q4-q7 (d8-d15) are callee-saved under AAPCS
    mov             r6, #32                 ; "diff" stride x2
    add             r5, r0, #16             ; second diff pointer
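    ; each 16-wide diff row is 32 bytes: r0 writes the left 8 diffs of a row,
    ; r5 (diff + 16 bytes) the right 8; both step one full row (r6) per store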

subtract_mby_loop
    vld1.8          {q0}, [r1], r2          ;load src
    vld1.8          {q1}, [r3], r4          ;load pred
    vld1.8          {q2}, [r1], r2
    vld1.8          {q3}, [r3], r4
    vld1.8          {q4}, [r1], r2
    vld1.8          {q5}, [r3], r4
    vld1.8          {q6}, [r1], r2
    vld1.8          {q7}, [r3], r4

    vsubl.u8        q8, d0, d2
    vsubl.u8        q9, d1, d3
    vsubl.u8        q10, d4, d6
    vsubl.u8        q11, d5, d7
    vsubl.u8        q12, d8, d10
    vsubl.u8        q13, d9, d11
    vsubl.u8        q14, d12, d14
    vsubl.u8        q15, d13, d15

    vst1.16         {q8}, [r0], r6          ;store diff
    vst1.16         {q9}, [r5], r6
    vst1.16         {q10}, [r0], r6
    vst1.16         {q11}, [r5], r6
    vst1.16         {q12}, [r0], r6
    vst1.16         {q13}, [r5], r6
    vst1.16         {q14}, [r0], r6
    vst1.16         {q15}, [r5], r6

    subs            r12, r12, #1
    bne             subtract_mby_loop

    vpop            {q4-q7}
    pop             {r4-r7}
    bx              lr
    ENDP

;=================================
;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc,
;                            int src_stride, unsigned char *upred,
;                            unsigned char *vpred, int pred_stride)
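;
; Rough C sketch of the operation (illustrative; as the code below implies,
; the U residual starts at diff + 256 and the V residual at diff + 320, each
; laid out 8 shorts per row):
;
;   void subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
;                      int src_stride, unsigned char *upred,
;                      unsigned char *vpred, int pred_stride)
;   {
;       short *udiff = diff + 256;
;       short *vdiff = diff + 320;
;       int r, c;
;       for (r = 0; r < 8; r++) {
;           for (c = 0; c < 8; c++) {
;               udiff[c] = usrc[c] - upred[c];
;               vdiff[c] = vsrc[c] - vpred[c];
;           }
;           udiff += 8;            vdiff += 8;
;           usrc  += src_stride;   vsrc  += src_stride;
;           upred += pred_stride;  vpred += pred_stride;
;       }
;   }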

|vp8_subtract_mbuv_neon| PROC
    push            {r4-r7}
    ldr             r4, [sp, #16]       ; upred
    ldr             r5, [sp, #20]       ; vpred
    ldr             r6, [sp, #24]       ; pred_stride
    vpush           {q4-q7}             ; q4-q7 (d8-d15) are callee-saved under AAPCS
    add             r0, r0, #512        ; short *udiff = diff + 256;
    mov             r12, #32            ; "diff" stride x2
    add             r7, r0, #16         ; second diff pointer
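    ; each 8-wide diff row is 16 bytes, so with the 32-byte post-increment
    ; r0 writes the even rows and r7 (udiff + 16 bytes) the odd rows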

;u
    vld1.8          {d0}, [r1], r3      ;load usrc
    vld1.8          {d1}, [r4], r6      ;load upred
    vld1.8          {d2}, [r1], r3
    vld1.8          {d3}, [r4], r6
    vld1.8          {d4}, [r1], r3
    vld1.8          {d5}, [r4], r6
    vld1.8          {d6}, [r1], r3
    vld1.8          {d7}, [r4], r6
    vld1.8          {d8}, [r1], r3
    vld1.8          {d9}, [r4], r6
    vld1.8          {d10}, [r1], r3
    vld1.8          {d11}, [r4], r6
    vld1.8          {d12}, [r1], r3
    vld1.8          {d13}, [r4], r6
    vld1.8          {d14}, [r1], r3
    vld1.8          {d15}, [r4], r6

    vsubl.u8        q8, d0, d1
    vsubl.u8        q9, d2, d3
    vsubl.u8        q10, d4, d5
    vsubl.u8        q11, d6, d7
    vsubl.u8        q12, d8, d9
    vsubl.u8        q13, d10, d11
    vsubl.u8        q14, d12, d13
    vsubl.u8        q15, d14, d15

    vst1.16         {q8}, [r0], r12     ;store diff
    vst1.16         {q9}, [r7], r12
    vst1.16         {q10}, [r0], r12
    vst1.16         {q11}, [r7], r12
    vst1.16         {q12}, [r0], r12
    vst1.16         {q13}, [r7], r12
    vst1.16         {q14}, [r0], r12
    vst1.16         {q15}, [r7], r12

;v
    vld1.8          {d0}, [r2], r3      ;load vsrc
    vld1.8          {d1}, [r5], r6      ;load vpred
    vld1.8          {d2}, [r2], r3
    vld1.8          {d3}, [r5], r6
    vld1.8          {d4}, [r2], r3
    vld1.8          {d5}, [r5], r6
    vld1.8          {d6}, [r2], r3
    vld1.8          {d7}, [r5], r6
    vld1.8          {d8}, [r2], r3
    vld1.8          {d9}, [r5], r6
    vld1.8          {d10}, [r2], r3
    vld1.8          {d11}, [r5], r6
    vld1.8          {d12}, [r2], r3
    vld1.8          {d13}, [r5], r6
    vld1.8          {d14}, [r2], r3
    vld1.8          {d15}, [r5], r6

    vsubl.u8        q8, d0, d1
    vsubl.u8        q9, d2, d3
    vsubl.u8        q10, d4, d5
    vsubl.u8        q11, d6, d7
    vsubl.u8        q12, d8, d9
    vsubl.u8        q13, d10, d11
    vsubl.u8        q14, d12, d13
    vsubl.u8        q15, d14, d15

    vst1.16         {q8}, [r0], r12     ;store diff
    vst1.16         {q9}, [r7], r12
    vst1.16         {q10}, [r0], r12
    vst1.16         {q11}, [r7], r12
    vst1.16         {q12}, [r0], r12
    vst1.16         {q13}, [r7], r12
    vst1.16         {q14}, [r0], r12
    vst1.16         {q15}, [r7], r12

    vpop            {q4-q7}
    pop             {r4-r7}
    bx              lr

    ENDP

    END