;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_variance16x16_neon|
    EXPORT  |vp8_variance16x8_neon|
    EXPORT  |vp8_variance8x16_neon|
    EXPORT  |vp8_variance8x8_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int  source_stride
; r2    unsigned char *ref_ptr
; r3    int  recon_stride
; stack unsigned int *sse
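;
; For reference, a C sketch of what this routine computes. This is
; illustrative only; the name variance16x16_ref is hypothetical and this
; is not the project's actual C implementation:
;
;   unsigned int variance16x16_ref(const unsigned char *src_ptr,
;                                  int source_stride,
;                                  const unsigned char *ref_ptr,
;                                  int recon_stride,
;                                  unsigned int *sse)
;   {
;       int i, j, d, sum = 0;
;       unsigned int sq = 0;
;       for (i = 0; i < 16; i++, src_ptr += source_stride,
;                                ref_ptr += recon_stride)
;           for (j = 0; j < 16; j++) {
;               d = src_ptr[j] - ref_ptr[j];
;               sum += d;               /* accumulated in q8 below */
;               sq  += d * d;           /* accumulated in q9/q10 below */
;           }
;       *sse = sq;
;       /* >>8 is /256, the pixel count of a 16x16 block */
;       return sq - (unsigned int)(((long long)sum * sum) >> 8);
;   }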
|vp8_variance16x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8

variance16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    ;VPADAL adds adjacent pairs of elements of a vector and accumulates
    ;the results into the elements of the destination vector. The
    ;explanation in the ARM guide is wrong.
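    ;For example, vpadal.s16 q8, q11 performs, for each 32-bit lane i = 0..3:
    ;    q8.s32[i] += q11.s16[2*i] + q11.s16[2*i+1]
    ;folding eight 16-bit diffs into four 32-bit sum accumulators.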
    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
    ;vmov.32        r1, d1[0]
    ;mul            r0, r0, r0
    ;str            r1, [r12]
    ;sub            r0, r1, r0, asr #8
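    ;The scalar path above is kept only as a comment: the vmov.32
    ;transfers from NEON to core registers are expensive, so the final
    ;arithmetic below stays entirely in NEON registers instead.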

    ;sum is in [-255*256, 255*256], so sum*sum fits in 32 bits. The right
    ;shift needs sign-bit extension, which is what vshr.s provides; s32
    ;must be used to get it right.
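    ;Worked example: a 16x16 block has 256 pixels, so
    ;    variance = sse - sum*sum/256
    ;and the divide by 256 is the arithmetic shift right by #8 below.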
    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #8
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;================================
;unsigned int vp8_variance16x8_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)
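;Same structure as the 16x16 routine above, but with 4 loop iterations
;of 2 16-pixel rows each (8 rows), and a final shift of #7 because the
;block has 16*8 = 128 pixels: variance = sse - sum*sum/128.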
|vp8_variance16x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #4

variance16x8_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #7
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;=================================
;unsigned int vp8_variance8x16_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)
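;Same accumulation again, now on 8-pixel-wide rows: 8 iterations of
;2 rows = 16 rows, 8*16 = 128 pixels, so the final shift is again #7.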

|vp8_variance8x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8

variance8x16_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d2, d6

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    bne             variance8x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #7
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;==================================
; r0    unsigned char *src_ptr
; r1    int  source_stride
; r2    unsigned char *ref_ptr
; r3    int  recon_stride
; stack unsigned int *sse
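; The loop below runs twice, loading 4 rows of source and reference per
; pass (8 rows total); 8*8 = 64 pixels, so the final shift is #6
; (variance = sse - sum*sum/64).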
|vp8_variance8x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2

variance8x8_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance8x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #6
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

    END