;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_sub_pixel_variance8x8_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
; r0    unsigned char  *src_ptr,
; r1    int  src_pixels_per_line,
; r2    int  xoffset,
; r3    int  yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pixels_per_line,
; stack(r6) unsigned int *sse
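;for reference, the C-level signature this implements (parameter names
;follow the register comments above, per the libvpx variance interface):
;  unsigned int vp8_sub_pixel_variance8x8_neon(
;      const unsigned char *src_ptr, int src_pixels_per_line,
;      int xoffset, int yoffset,
;      const unsigned char *dst_ptr, int dst_pixels_per_line,
;      unsigned int *sse);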
;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
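;
;the routine runs in three stages:
;  1) horizontal bilinear filter: 9 rows x 8 columns (skipped if xoffset=0)
;  2) vertical bilinear filter down to the 8x8 prediction (skipped if yoffset=0)
;  3) variance of the prediction against dst: variance = sse - sum*sum/64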

|vp8_sub_pixel_variance8x8_neon| PROC
    push            {r4-r5, lr}             ;3 words pushed, so stack args start at [sp, #12]

    adr             r12, bilinear_taps_coeff
    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
    ldr             lr, [sp, #20]           ;load *sse from stack

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             skip_firstpass_filter

;First pass: output_height lines x output_width columns (9x8)
    add             r2, r12, r2, lsl #3     ;calculate filter location
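;each tap pair is stored as two 32-bit words (8 bytes), hence xoffset<<3;
;each first-pass output pixel is, in effect,
;  out[j] = (src[j]*Filter[0] + src[j+1]*Filter[1] + 64) >> 7
;computed below as widening multiply-accumulates followed by a rounding
;narrowing shift (vqrshrn #7)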

    vld1.u8         {q1}, [r0], r1          ;load src data
    vld1.u32        {d31}, [r2]             ;load first_pass filter
    vld1.u8         {q2}, [r0], r1
    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
    vld1.u8         {q3}, [r0], r1
    vdup.8          d1, d31[4]
    vld1.u8         {q4}, [r0], r1

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1

    vld1.u8         {q1}, [r0], r1          ;load src data
    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
    vld1.u8         {q2}, [r0], r1
    vqrshrn.u16    d23, q7, #7
    vld1.u8         {q3}, [r0], r1
    vqrshrn.u16    d24, q8, #7
    vld1.u8         {q4}, [r0], r1
    vqrshrn.u16    d25, q9, #7

    ;first_pass filtering on the remaining 5 lines of data
    vld1.u8         {q5}, [r0], r1

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0
    vmull.u8        q10, d10, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1
    vext.8          d11, d10, d11, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1
    vmlal.u8        q10, d11, d1

    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
    vqrshrn.u16    d27, q7, #7
    vqrshrn.u16    d28, q8, #7
    vqrshrn.u16    d29, q9, #7
    vqrshrn.u16    d30, q10, #7
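
;d22-d30 now hold the 9 horizontally filtered rows; 9 rows (not 8) are
;needed because the vertical pass pairs each row with the row below it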

;Second pass: 8x8
secondpass_filter
    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
    beq             sub_pixel_variance8x8_neon  ;skip_secondpass_filter

    add             r3, r12, r3, lsl #3     ;calculate filter location

    vld1.u32        {d31}, [r3]             ;load second_pass filter

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]

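;each output row is, in effect,
;  out_row[i] = (row[i]*Filter[0] + row[i+1]*Filter[1] + 64) >> 7
;consuming the 9 rows in d22-d30 and writing the 8 results back to d22-d29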
    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
    vmull.u8        q2, d23, d0
    vmull.u8        q3, d24, d0
    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * Filter[1])
    vmlal.u8        q2, d24, d1
    vmlal.u8        q3, d25, d1
    vmlal.u8        q4, d26, d1
    vmlal.u8        q5, d27, d1
    vmlal.u8        q6, d28, d1
    vmlal.u8        q7, d29, d1
    vmlal.u8        q8, d30, d1

    vqrshrn.u16    d22, q1, #7              ;shift/round/saturate to u8
    vqrshrn.u16    d23, q2, #7
    vqrshrn.u16    d24, q3, #7
    vqrshrn.u16    d25, q4, #7
    vqrshrn.u16    d26, q5, #7
    vqrshrn.u16    d27, q6, #7
    vqrshrn.u16    d28, q7, #7
    vqrshrn.u16    d29, q8, #7

    b               sub_pixel_variance8x8_neon

;--------------------
skip_firstpass_filter
    vld1.u8         {d22}, [r0], r1         ;load src data
    vld1.u8         {d23}, [r0], r1
    vld1.u8         {d24}, [r0], r1
    vld1.u8         {d25}, [r0], r1
    vld1.u8         {d26}, [r0], r1
    vld1.u8         {d27}, [r0], r1
    vld1.u8         {d28}, [r0], r1
    vld1.u8         {d29}, [r0], r1
    vld1.u8         {d30}, [r0], r1

    b               secondpass_filter

;----------------------
;vp8_variance8x8_neon
sub_pixel_variance8x8_neon
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2                     ;loop counter: 2 iterations x 4 rows
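;each iteration compares the 4 prediction rows in d22-d25 against 4 dst
;rows; the vmov q11/q12 moves below slide prediction rows 4-7 into
;d22-d25 for the second iteration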

sub_pixel_variance8x8_neon_loop
    vld1.8          {d0}, [r4], r5              ;load dst data
    subs            r12, r12, #1
    vld1.8          {d1}, [r4], r5
    vld1.8          {d2}, [r4], r5
    vsubl.u8        q4, d22, d0                 ;calculate diff
    vld1.8          {d3}, [r4], r5

    vsubl.u8        q5, d23, d1
    vsubl.u8        q6, d24, d2

    vpadal.s16      q8, q4                      ;sum
    vmlal.s16       q9, d8, d8                  ;sse
    vmlal.s16       q10, d9, d9

    vsubl.u8        q7, d25, d3

    vpadal.s16      q8, q5
    vmlal.s16       q9, d10, d10
    vmlal.s16       q10, d11, d11

    vmov            q11, q13                    ;prediction rows 4-5 -> d22-d23

    vpadal.s16      q8, q6
    vmlal.s16       q9, d12, d12
    vmlal.s16       q10, d13, d13

    vmov            q12, q14                    ;prediction rows 6-7 -> d24-d25

    vpadal.s16      q8, q7
    vmlal.s16       q9, d14, d14
    vmlal.s16       q10, d15, d15

    bne             sub_pixel_variance8x8_neon_loop
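
;reduce the lane accumulators and form the variance:
;  q8 holds 4 x s32 partial sums; q9 and q10 hold 8 x u32 partial sse
;  variance = sse - sum*sum/64   (64 = 8*8 pixels; sum*sum fits in 32
;  bits since |sum| <= 64*255)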

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1                  ;d0 = total sum
    vadd.u64        d1, d2, d3                  ;d1 = total sse

    vmull.s32       q5, d0, d0                  ;sum * sum
    vst1.32         {d1[0]}, [lr]               ;store sse
    vshr.u32        d10, d10, #6                ;(sum * sum) / 64
    vsub.u32        d0, d1, d10                 ;sse - sum*sum/64

    vmov.32         r0, d0[0]                   ;return variance
    pop             {r4-r5, pc}

    ENDP

;-----------------

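;eight {Filter[0], Filter[1]} pairs, one per eighth-pel offset; each pair
;occupies two 32-bit words (8 bytes) and the taps sum to 128, which the
;rounding shift by 7 in the filter passes normalizes away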
bilinear_taps_coeff
    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

    END