Home | History | Annotate | Download | only in neon
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_mse16x16_neon|
     13     EXPORT  |vp8_get4x4sse_cs_neon|
     14 
     15     ARM
     16     REQUIRE8
     17     PRESERVE8
     18 
     19     AREA ||.text||, CODE, READONLY, ALIGN=2
     20 ;============================
     21 ; r0    unsigned char *src_ptr
     22 ; r1    int source_stride
     23 ; r2    unsigned char *ref_ptr
     24 ; r3    int  recon_stride
     25 ; stack unsigned int *sse
     26 ;note: in this function, sum is never used. So, we can remove this part of calculation
     27 ;from vp8_variance().
     28 
     29 |vp8_mse16x16_neon| PROC
     30     vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
     31     vmov.i8         q8, #0
     32     vmov.i8         q9, #0
     33     vmov.i8         q10, #0
     34 
     35     mov             r12, #8
     36 
     37 mse16x16_neon_loop
     38     vld1.8          {q0}, [r0], r1              ;Load up source and reference
     39     vld1.8          {q2}, [r2], r3
     40     vld1.8          {q1}, [r0], r1
     41     vld1.8          {q3}, [r2], r3
     42 
     43     vsubl.u8        q11, d0, d4
     44     vsubl.u8        q12, d1, d5
     45     vsubl.u8        q13, d2, d6
     46     vsubl.u8        q14, d3, d7
     47 
     48     vmlal.s16       q7, d22, d22
     49     vmlal.s16       q8, d23, d23
     50 
     51     subs            r12, r12, #1
     52 
     53     vmlal.s16       q9, d24, d24
     54     vmlal.s16       q10, d25, d25
     55     vmlal.s16       q7, d26, d26
     56     vmlal.s16       q8, d27, d27
     57     vmlal.s16       q9, d28, d28
     58     vmlal.s16       q10, d29, d29
     59 
     60     bne             mse16x16_neon_loop
     61 
     62     vadd.u32        q7, q7, q8
     63     vadd.u32        q9, q9, q10
     64 
     65     ldr             r12, [sp]               ;load *sse from stack
     66 
     67     vadd.u32        q10, q7, q9
     68     vpaddl.u32      q1, q10
     69     vadd.u64        d0, d2, d3
     70 
     71     vst1.32         {d0[0]}, [r12]
     72     vmov.32         r0, d0[0]
     73 
     74     bx              lr
     75 
     76     ENDP
     77 
     78 
     79 ;=============================
     80 ; r0    unsigned char *src_ptr,
     81 ; r1    int  source_stride,
     82 ; r2    unsigned char *ref_ptr,
     83 ; r3    int  recon_stride
     84 |vp8_get4x4sse_cs_neon| PROC
     85     vld1.8          {d0}, [r0], r1              ;Load up source and reference
     86     vld1.8          {d4}, [r2], r3
     87     vld1.8          {d1}, [r0], r1
     88     vld1.8          {d5}, [r2], r3
     89     vld1.8          {d2}, [r0], r1
     90     vld1.8          {d6}, [r2], r3
     91     vld1.8          {d3}, [r0], r1
     92     vld1.8          {d7}, [r2], r3
     93 
     94     vsubl.u8        q11, d0, d4
     95     vsubl.u8        q12, d1, d5
     96     vsubl.u8        q13, d2, d6
     97     vsubl.u8        q14, d3, d7
     98 
     99     vmull.s16       q7, d22, d22
    100     vmull.s16       q8, d24, d24
    101     vmull.s16       q9, d26, d26
    102     vmull.s16       q10, d28, d28
    103 
    104     vadd.u32        q7, q7, q8
    105     vadd.u32        q9, q9, q10
    106     vadd.u32        q9, q7, q9
    107 
    108     vpaddl.u32      q1, q9
    109     vadd.u64        d0, d2, d3
    110 
    111     vmov.32         r0, d0[0]
    112     bx              lr
    113 
    114     ENDP
    115 
    116     END
    117