Home | History | Annotate | Download | only in neon
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_loop_filter_simple_horizontal_edge_neon|
     13     ARM
     14     REQUIRE8
     15     PRESERVE8
     16 
     17     AREA ||.text||, CODE, READONLY, ALIGN=2
     18 ;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
     19 ;are equal, so the code needs only a single load
     20 ;for flimit. The same applies to limit and thresh.
     21 ; r0    unsigned char *s,
     22 ; r1    int p, //pitch
     23 ; r2    const signed char *flimit,
     24 ; r3    const signed char *limit,
     25 ; stack(r4) const signed char *thresh,
     26 ; //stack(r5)   int count --unused
     27 
     28 |vp8_loop_filter_simple_horizontal_edge_neon| PROC
     29     sub         r0, r0, r1, lsl #1          ; move src pointer down by 2 lines
     30 
     31     ldr         r12, _lfhy_coeff_
     32     vld1.u8     {q5}, [r0], r1              ; p1
     33     vld1.s8     {d2[], d3[]}, [r2]          ; flimit
     34     vld1.s8     {d26[], d27[]}, [r3]        ; limit -> q13
     35     vld1.u8     {q6}, [r0], r1              ; p0
     36     vld1.u8     {q0}, [r12]!                ; 0x80
     37     vld1.u8     {q7}, [r0], r1              ; q0
     38     vld1.u8     {q10}, [r12]!               ; 0x03
     39     vld1.u8     {q8}, [r0]                  ; q1
     40 
     41     ;vp8_filter_mask() function
     42     vabd.u8     q15, q6, q7                 ; abs(p0 - q0)
     43     vabd.u8     q14, q5, q8                 ; abs(p1 - q1)
     44     vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
     45     vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
     46     vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
     47 
     48     ;vp8_filter() function
     49     veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
     50     veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
     51     veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
     52     veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
     53 
     54     vadd.u8     q1, q1, q1                  ; flimit * 2
     55     vadd.u8     q1, q1, q13                 ; flimit * 2 + limit
     56     vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
     57 
     58 ;;;;;;;;;;
     59     ;vqsub.s8   q2, q7, q6                  ; ( qs0 - ps0)
     60     vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
     61     vsubl.s8    q3, d15, d13
     62 
     63     vqsub.s8    q4, q5, q8                  ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
     64 
     65     ;vmul.i8    q2, q2, q10                 ;  3 * ( qs0 - ps0)
     66     vadd.s16    q11, q2, q2                 ;  3 * ( qs0 - ps0)
     67     vadd.s16    q12, q3, q3
     68 
     69     vld1.u8     {q9}, [r12]!                ; 0x04
     70 
     71     vadd.s16    q2, q2, q11
     72     vadd.s16    q3, q3, q12
     73 
     74     vaddw.s8    q2, q2, d8                  ; vp8_filter + 3 * ( qs0 - ps0)
     75     vaddw.s8    q3, q3, d9
     76 
     77     ;vqadd.s8   q4, q4, q2                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
     78     vqmovn.s16  d8, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
     79     vqmovn.s16  d9, q3
     80 ;;;;;;;;;;;;;
     81 
     82     vand        q4, q4, q15                 ; vp8_filter &= mask
     83 
     84     vqadd.s8    q2, q4, q10                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
     85     vqadd.s8    q4, q4, q9                  ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
     86     vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
     87     vshr.s8     q4, q4, #3                  ; Filter1 >>= 3
     88 
     89     sub         r0, r0, r1, lsl #1
     90 
     91     ;calculate output
     92     vqadd.s8    q11, q6, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
     93     vqsub.s8    q10, q7, q4                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
     94 
     95     add         r3, r0, r1
     96 
     97     veor        q6, q11, q0                 ; *op0 = u^0x80
     98     veor        q7, q10, q0                 ; *oq0 = u^0x80
     99 
    100     vst1.u8     {q6}, [r0]                  ; store op0
    101     vst1.u8     {q7}, [r3]                  ; store oq0
    102 
    103     bx          lr
    104     ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|
    105 
    106 ;-----------------
    AREA    hloopfiltery_dat, DATA, READWRITE           ;read/write by default
; Constants for vp8_loop_filter_simple_horizontal_edge_neon.
; _lfhy_coeff_ holds the address of the table; the routine loads that
; address into r12 and then reads the table with three post-incremented
; 16-byte loads, so the row order below (0x80, 0x03, 0x04) must not change.
; Each constant is one byte value repeated across a 16-byte NEON register.
_lfhy_coeff_    DCD     lfhy_coeff
lfhy_coeff      DCD     0x80808080, 0x80808080          ; 16 x 0x80
                DCD     0x80808080, 0x80808080
                DCD     0x03030303, 0x03030303          ; 16 x 0x03
                DCD     0x03030303, 0x03030303
                DCD     0x04040404, 0x04040404          ; 16 x 0x04
                DCD     0x04040404, 0x04040404
    117 
    118     END
    119