Home | History | Annotate | Download | only in neon
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_bilinear_predict8x4_neon|     ; make the entry point visible to the linker
     13     ARM                                         ; assemble the following code as ARM instructions
     14     REQUIRE8                                    ; this code requires 8-byte stack alignment
     15     PRESERVE8                                   ; this code preserves 8-byte stack alignment
     16 
     17     AREA ||.text||, CODE, READONLY, ALIGN=2     ; read-only code area, aligned to 2^2 = 4 bytes
     18 ; 8x4 bilinear prediction: 2-tap horizontal pass over 5 source rows, then a
     19 ; 2-tap vertical pass that writes 4 rows of 8 bytes to dst_ptr.
     20 ; r0 = unsigned char *src_ptr            r1 = int src_pixels_per_line
     21 ; r2 = int xoffset (0 skips first pass)  r3 = int yoffset (0 skips second pass)
     22 ; [sp]     unsigned char *dst_ptr  (loaded into r4)
     23 ; [sp, #4] int dst_pitch           (loaded into lr)
     24 ; Clobbers r0-r3, r12, q0-q3, d16-d26, d31; r4 and d8-d15 are saved and restored.
     25 |vp8_bilinear_predict8x4_neon| PROC
     26     push            {r4, lr}
     27     vpush           {d8-d15}                ;q4-q7 are callee-saved (AAPCS) and used as scratch below
     28     ldr             r12, _bifilter8x4_coeff_ ;r12 = address of the filter tap table
     29     ldr             r4, [sp, #72]           ;dst_ptr: 8 bytes (r4, lr) + 64 bytes (d8-d15) above sp
     30     ldr             lr, [sp, #76]           ;dst_pitch: next stack argument
     31 
     32     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     33     beq             skip_firstpass_filter
     34 
     35 ;First pass: output_height lines x output_width columns (5x8)
     36     add             r2, r12, r2, lsl #3     ;r2 = table + xoffset*8 (two 32-bit taps per entry)
     37 
     38     vld1.u8         {q1}, [r0], r1          ;row 0: load 16 src bytes, r0 += src_pixels_per_line
     39     vld1.u32        {d31}, [r2]             ;load first_pass filter (two words)
     40     vld1.u8         {q2}, [r0], r1          ;row 1
     41     vdup.8          d0, d31[0]              ;d0 = vp8_filter[0] in every byte lane
     42     vld1.u8         {q3}, [r0], r1          ;row 2
     43     vdup.8          d1, d31[4]              ;d1 = vp8_filter[1] in every byte lane
     44     vld1.u8         {q4}, [r0], r1          ;row 3
     45 
     46     vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp8_filter[0])
     47     vld1.u8         {q5}, [r0], r1          ;row 4 (extra row consumed by the second pass)
     48     vmull.u8        q7, d4, d0
     49     vmull.u8        q8, d6, d0
     50     vmull.u8        q9, d8, d0
     51     vmull.u8        q10, d10, d0
     52 
     53     vext.8          d3, d2, d3, #1          ;construct src_ptr[1]: bytes 1..8 of each row
     54     vext.8          d5, d4, d5, #1
     55     vext.8          d7, d6, d7, #1
     56     vext.8          d9, d8, d9, #1
     57     vext.8          d11, d10, d11, #1
     58 
     59     vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp8_filter[1])
     60     vmlal.u8        q7, d5, d1
     61     vmlal.u8        q8, d7, d1
     62     vmlal.u8        q9, d9, d1
     63     vmlal.u8        q10, d11, d1
     64 
     65     vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8; rows 0-4 end up in d22-d26
     66     vqrshrn.u16    d23, q7, #7
     67     vqrshrn.u16    d24, q8, #7
     68     vqrshrn.u16    d25, q9, #7
     69     vqrshrn.u16    d26, q10, #7
     70 
     71 ;Second pass: 4x8 (input rows are in d22-d26 on entry)
     72 secondpass_filter
     73     cmp             r3, #0                  ;skip second_pass filter if yoffset=0
     74     beq             skip_secondpass_filter
     75 
     76     add             r3, r12, r3, lsl #3     ;r3 = table + yoffset*8
     77     add             r0, r4, lr              ;r0 = address of dst row 1
     78 
     79     vld1.u32        {d31}, [r3]             ;load second_pass filter
     80     add             r1, r0, lr              ;r1 = address of dst row 2
     81 
     82     vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
     83     vdup.8          d1, d31[4]
     84 
     85     vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp8_filter[0])
     86     vmull.u8        q2, d23, d0
     87     vmull.u8        q3, d24, d0
     88     vmull.u8        q4, d25, d0
     89 
     90     vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp8_filter[1]): the row below
     91     vmlal.u8        q2, d24, d1
     92     vmlal.u8        q3, d25, d1
     93     vmlal.u8        q4, d26, d1
     94 
     95     add             r2, r1, lr              ;r2 = address of dst row 3
     96 
     97     vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
     98     vqrshrn.u16    d3, q2, #7
     99     vqrshrn.u16    d4, q3, #7
    100     vqrshrn.u16    d5, q4, #7
    101 
    102     vst1.u8         {d2}, [r4]              ;store result: dst rows 0-3
    103     vst1.u8         {d3}, [r0]
    104     vst1.u8         {d4}, [r1]
    105     vst1.u8         {d5}, [r2]
    106     vpop            {d8-d15}                ;restore callee-saved NEON registers
    107     pop             {r4, pc}
    108 
    109 ;--------------------
    110 skip_firstpass_filter
    111     vld1.u8         {d22}, [r0], r1         ;xoffset=0: load the 5 src rows unfiltered into d22-d26
    112     vld1.u8         {d23}, [r0], r1
    113     vld1.u8         {d24}, [r0], r1
    114     vld1.u8         {d25}, [r0], r1
    115     vld1.u8         {d26}, [r0], r1
    116 
    117     b               secondpass_filter
    118 
    119 ;---------------------
    120 skip_secondpass_filter
    121     vst1.u8         {d22}, [r4], lr         ;yoffset=0: store first-pass rows 0-3, r4 += dst_pitch
    122     vst1.u8         {d23}, [r4], lr
    123     vst1.u8         {d24}, [r4], lr
    124     vst1.u8         {d25}, [r4], lr
    125     vpop            {d8-d15}                ;restore callee-saved NEON registers
    126     pop             {r4, pc}
    127 
    128     ENDP
    129 
    130 ;-----------------
    131     AREA    bifilters8x4_dat, DATA, READWRITE           ;data area holding the filter taps
    132 ;bifilter8x4_coeff: 8 tap pairs, one 32-bit word per tap; the code indexes it by offset*8 bytes.
    133 ;_bifilter8x4_coeff_ is a one-word literal holding the table address (loaded into r12 by the code).
    134 _bifilter8x4_coeff_
    135     DCD     bifilter8x4_coeff
    136 bifilter8x4_coeff
    137     DCD     128, 0, 112, 16, 96, 32, 80, 48         ;tap pairs for offsets 0-3
    138     DCD     64, 64, 48, 80, 32, 96, 16, 112         ;tap pairs for offsets 4-7
    139 
    140     END
    141