; ARMv7 NEON block-subtraction kernels for the VP8 encoder (libvpx)
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT |vp8_subtract_b_neon_func|
     13     EXPORT |vp8_subtract_mby_neon|
     14     EXPORT |vp8_subtract_mbuv_neon|
     15 
     16     ARM
     17     REQUIRE8
     18     PRESERVE8
     19 
     20     AREA ||.text||, CODE, READONLY, ALIGN=2
     21 ;=========================================
     22 ;void vp8_subtract_b_neon_func(short *diff, unsigned char *src, unsigned char *pred, int stride, int pitch);
     23 |vp8_subtract_b_neon_func| PROC
     24     ldr             r12, [sp]               ;load pitch
     25 
     26     vld1.8          {d0}, [r1], r3          ;load src
     27     vld1.8          {d1}, [r2], r12         ;load pred
     28     vld1.8          {d2}, [r1], r3
     29     vld1.8          {d3}, [r2], r12
     30     vld1.8          {d4}, [r1], r3
     31     vld1.8          {d5}, [r2], r12
     32     vld1.8          {d6}, [r1], r3
     33     vld1.8          {d7}, [r2], r12
     34 
     35     vsubl.u8        q10, d0, d1
     36     vsubl.u8        q11, d2, d3
     37     vsubl.u8        q12, d4, d5
     38     vsubl.u8        q13, d6, d7
     39 
     40     mov             r12, r12, lsl #1
     41 
     42     vst1.16         {d20}, [r0], r12        ;store diff
     43     vst1.16         {d22}, [r0], r12
     44     vst1.16         {d24}, [r0], r12
     45     vst1.16         {d26}, [r0], r12
     46 
     47     bx              lr
     48     ENDP
     49 
     50 ;==========================================
     51 ;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
     52 |vp8_subtract_mby_neon| PROC
     53     mov             r12, #4
     54 
     55 subtract_mby_loop
     56     vld1.8          {q0}, [r1], r3          ;load src
     57     vld1.8          {q1}, [r2]!             ;load pred
     58     vld1.8          {q2}, [r1], r3
     59     vld1.8          {q3}, [r2]!
     60     vld1.8          {q4}, [r1], r3
     61     vld1.8          {q5}, [r2]!
     62     vld1.8          {q6}, [r1], r3
     63     vld1.8          {q7}, [r2]!
     64 
     65     vsubl.u8        q8, d0, d2
     66     vsubl.u8        q9, d1, d3
     67     vsubl.u8        q10, d4, d6
     68     vsubl.u8        q11, d5, d7
     69     vsubl.u8        q12, d8, d10
     70     vsubl.u8        q13, d9, d11
     71     vsubl.u8        q14, d12, d14
     72     vsubl.u8        q15, d13, d15
     73 
     74     vst1.16         {q8}, [r0]!             ;store diff
     75     vst1.16         {q9}, [r0]!
     76     vst1.16         {q10}, [r0]!
     77     vst1.16         {q11}, [r0]!
     78     vst1.16         {q12}, [r0]!
     79     vst1.16         {q13}, [r0]!
     80     vst1.16         {q14}, [r0]!
     81     vst1.16         {q15}, [r0]!
     82 
     83     subs            r12, r12, #1
     84     bne             subtract_mby_loop
     85 
     86     bx              lr
     87     ENDP
     88 
     89 ;=================================
     90 ;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
     91 |vp8_subtract_mbuv_neon| PROC
     92     ldr             r12, [sp]
     93 
     94 ;u
     95     add             r0, r0, #512        ;   short *udiff = diff + 256;
     96     add             r3, r3, #256        ;   unsigned char *upred = pred + 256;
     97 
     98     vld1.8          {d0}, [r1], r12         ;load src
     99     vld1.8          {d1}, [r3]!             ;load pred
    100     vld1.8          {d2}, [r1], r12
    101     vld1.8          {d3}, [r3]!
    102     vld1.8          {d4}, [r1], r12
    103     vld1.8          {d5}, [r3]!
    104     vld1.8          {d6}, [r1], r12
    105     vld1.8          {d7}, [r3]!
    106     vld1.8          {d8}, [r1], r12
    107     vld1.8          {d9}, [r3]!
    108     vld1.8          {d10}, [r1], r12
    109     vld1.8          {d11}, [r3]!
    110     vld1.8          {d12}, [r1], r12
    111     vld1.8          {d13}, [r3]!
    112     vld1.8          {d14}, [r1], r12
    113     vld1.8          {d15}, [r3]!
    114 
    115     vsubl.u8        q8, d0, d1
    116     vsubl.u8        q9, d2, d3
    117     vsubl.u8        q10, d4, d5
    118     vsubl.u8        q11, d6, d7
    119     vsubl.u8        q12, d8, d9
    120     vsubl.u8        q13, d10, d11
    121     vsubl.u8        q14, d12, d13
    122     vsubl.u8        q15, d14, d15
    123 
    124     vst1.16         {q8}, [r0]!             ;store diff
    125     vst1.16         {q9}, [r0]!
    126     vst1.16         {q10}, [r0]!
    127     vst1.16         {q11}, [r0]!
    128     vst1.16         {q12}, [r0]!
    129     vst1.16         {q13}, [r0]!
    130     vst1.16         {q14}, [r0]!
    131     vst1.16         {q15}, [r0]!
    132 
    133 ;v
    134     vld1.8          {d0}, [r2], r12         ;load src
    135     vld1.8          {d1}, [r3]!             ;load pred
    136     vld1.8          {d2}, [r2], r12
    137     vld1.8          {d3}, [r3]!
    138     vld1.8          {d4}, [r2], r12
    139     vld1.8          {d5}, [r3]!
    140     vld1.8          {d6}, [r2], r12
    141     vld1.8          {d7}, [r3]!
    142     vld1.8          {d8}, [r2], r12
    143     vld1.8          {d9}, [r3]!
    144     vld1.8          {d10}, [r2], r12
    145     vld1.8          {d11}, [r3]!
    146     vld1.8          {d12}, [r2], r12
    147     vld1.8          {d13}, [r3]!
    148     vld1.8          {d14}, [r2], r12
    149     vld1.8          {d15}, [r3]!
    150 
    151     vsubl.u8        q8, d0, d1
    152     vsubl.u8        q9, d2, d3
    153     vsubl.u8        q10, d4, d5
    154     vsubl.u8        q11, d6, d7
    155     vsubl.u8        q12, d8, d9
    156     vsubl.u8        q13, d10, d11
    157     vsubl.u8        q14, d12, d13
    158     vsubl.u8        q15, d14, d15
    159 
    160     vst1.16         {q8}, [r0]!             ;store diff
    161     vst1.16         {q9}, [r0]!
    162     vst1.16         {q10}, [r0]!
    163     vst1.16         {q11}, [r0]!
    164     vst1.16         {q12}, [r0]!
    165     vst1.16         {q13}, [r0]!
    166     vst1.16         {q14}, [r0]!
    167     vst1.16         {q15}, [r0]!
    168 
    169     bx              lr
    170     ENDP
    171 
    172     END
    173