Home | History | Annotate | Download | only in neon
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_mse16x16_neon|
     13     EXPORT  |vp8_get16x16pred_error_neon|
     14     EXPORT  |vp8_get4x4sse_cs_neon|
     15 
     16     ARM
     17     REQUIRE8
     18     PRESERVE8
     19 
     20     AREA ||.text||, CODE, READONLY, ALIGN=2
     21 ;============================
     22 ; r0    unsigned char *src_ptr
     23 ; r1    int source_stride
     24 ; r2    unsigned char *ref_ptr
     25 ; r3    int  recon_stride
     26 ; stack unsigned int *sse
     27 ;note: in this function, sum is never used. So, we can remove this part of calculation
     28 ;from vp8_variance().
     29 
     30 |vp8_mse16x16_neon| PROC
     31     vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
     32     vmov.i8         q8, #0
     33     vmov.i8         q9, #0
     34     vmov.i8         q10, #0
     35 
     36     mov             r12, #8
     37 
     38 mse16x16_neon_loop
     39     vld1.8          {q0}, [r0], r1              ;Load up source and reference
     40     vld1.8          {q2}, [r2], r3
     41     vld1.8          {q1}, [r0], r1
     42     vld1.8          {q3}, [r2], r3
     43 
     44     vsubl.u8        q11, d0, d4
     45     vsubl.u8        q12, d1, d5
     46     vsubl.u8        q13, d2, d6
     47     vsubl.u8        q14, d3, d7
     48 
     49     vmlal.s16       q7, d22, d22
     50     vmlal.s16       q8, d23, d23
     51 
     52     subs            r12, r12, #1
     53 
     54     vmlal.s16       q9, d24, d24
     55     vmlal.s16       q10, d25, d25
     56     vmlal.s16       q7, d26, d26
     57     vmlal.s16       q8, d27, d27
     58     vmlal.s16       q9, d28, d28
     59     vmlal.s16       q10, d29, d29
     60 
     61     bne             mse16x16_neon_loop
     62 
     63     vadd.u32        q7, q7, q8
     64     vadd.u32        q9, q9, q10
     65 
     66     ldr             r12, [sp]               ;load *sse from stack
     67 
     68     vadd.u32        q10, q7, q9
     69     vpaddl.u32      q1, q10
     70     vadd.u64        d0, d2, d3
     71 
     72     vst1.32         {d0[0]}, [r12]
     73     vmov.32         r0, d0[0]
     74 
     75     bx              lr
     76 
     77     ENDP
     78 
     79 ;============================
     80 ; r0    unsigned char *src_ptr
     81 ; r1    int src_stride
     82 ; r2    unsigned char *ref_ptr
     83 ; r3    int ref_stride
     84 |vp8_get16x16pred_error_neon| PROC
     85     vmov.i8         q8, #0                      ;q8 - sum
     86     vmov.i8         q9, #0                      ;q9, q10 - pred_error
     87     vmov.i8         q10, #0
     88 
     89     mov             r12, #8
     90 
     91 get16x16pred_error_neon_loop
     92     vld1.8          {q0}, [r0], r1              ;Load up source and reference
     93     vld1.8          {q2}, [r2], r3
     94     vld1.8          {q1}, [r0], r1
     95     vld1.8          {q3}, [r2], r3
     96 
     97     vsubl.u8        q11, d0, d4
     98     vsubl.u8        q12, d1, d5
     99     vsubl.u8        q13, d2, d6
    100     vsubl.u8        q14, d3, d7
    101 
    102     vpadal.s16      q8, q11
    103     vmlal.s16       q9, d22, d22
    104     vmlal.s16       q10, d23, d23
    105 
    106     subs            r12, r12, #1
    107 
    108     vpadal.s16      q8, q12
    109     vmlal.s16       q9, d24, d24
    110     vmlal.s16       q10, d25, d25
    111     vpadal.s16      q8, q13
    112     vmlal.s16       q9, d26, d26
    113     vmlal.s16       q10, d27, d27
    114     vpadal.s16      q8, q14
    115     vmlal.s16       q9, d28, d28
    116     vmlal.s16       q10, d29, d29
    117 
    118     bne             get16x16pred_error_neon_loop
    119 
    120     vadd.u32        q10, q9, q10
    121     vpaddl.s32      q0, q8
    122 
    123     vpaddl.u32      q1, q10
    124     vadd.s64        d0, d0, d1
    125     vadd.u64        d1, d2, d3
    126 
    127     vmull.s32       q5, d0, d0
    128     vshr.s32        d10, d10, #8
    129     vsub.s32        d0, d1, d10
    130 
    131     vmov.32         r0, d0[0]
    132     bx              lr
    133 
    134     ENDP
    135 
    136 ;=============================
    137 ; r0    unsigned char *src_ptr,
    138 ; r1    int  source_stride,
    139 ; r2    unsigned char *ref_ptr,
    140 ; r3    int  recon_stride
    141 |vp8_get4x4sse_cs_neon| PROC
    142     vld1.8          {d0}, [r0], r1              ;Load up source and reference
    143     vld1.8          {d4}, [r2], r3
    144     vld1.8          {d1}, [r0], r1
    145     vld1.8          {d5}, [r2], r3
    146     vld1.8          {d2}, [r0], r1
    147     vld1.8          {d6}, [r2], r3
    148     vld1.8          {d3}, [r0], r1
    149     vld1.8          {d7}, [r2], r3
    150 
    151     vsubl.u8        q11, d0, d4
    152     vsubl.u8        q12, d1, d5
    153     vsubl.u8        q13, d2, d6
    154     vsubl.u8        q14, d3, d7
    155 
    156     vmull.s16       q7, d22, d22
    157     vmull.s16       q8, d24, d24
    158     vmull.s16       q9, d26, d26
    159     vmull.s16       q10, d28, d28
    160 
    161     vadd.u32        q7, q7, q8
    162     vadd.u32        q9, q9, q10
    163     vadd.u32        q9, q7, q9
    164 
    165     vpaddl.u32      q1, q9
    166     vadd.u64        d0, d2, d3
    167 
    168     vmov.32         r0, d0[0]
    169     bx              lr
    170 
    171     ENDP
    172 
    173     END
    174