; vp8_fast_fdct8x4_neon -- ARM NEON fast 8x4 forward DCT (vp8 encoder, neon/)
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_fast_fdct8x4_neon|
     13 
     14     ARM
     15     REQUIRE8
     16     PRESERVE8
     17 
     18     AREA ||.text||, CODE, READONLY, ALIGN=2
     19 ;void vp8_fast_fdct8x4_neon(short *input, short *output, int pitch);
     20 ;NOTE:
     21 ;The input *src_diff. src_diff is calculated as:
     22 ;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function)
     23 ;In which *src_ptr and *pred_ptr both are unsigned char.
     24 ;Therefore, *src_diff should be in the range of [-255, 255].
     25 ;CAUTION:
     26 ;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255].
     27 ;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes
     28 ;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c.
     29 
     30 |vp8_fast_fdct8x4_neon| PROC
;-----------------------------------------------------------------------
; void vp8_fast_fdct8x4_neon(short *input, short *output, int pitch)
; In:  r0 = input  (two side-by-side 4x4 blocks of short diff values,
;                   assumed in [-255, 255] -- see NOTE above)
;      r1 = output (32 shorts: left 4x4 block's coefficients, then the
;                   right 4x4 block's)
;      r2 = pitch  (byte stride between input rows)
; The two 4x4 fast DCTs run in parallel: the left block travels in
; d2/d4/d6/d8, the right block in d3/d5/d7/d9.  d0 holds the packed
; 16-bit constants d0[0]=x_c1=60547, d0[1]=x_c2=46341, d0[2]=x_c3=25080.
; x_c1 and x_c2 exceed 0x7FFF, so vqdmulh applies them as negative
; signed values; the code compensates by adding the multiplicand back in
; afterwards (the "+ c1" / "+ temp1" adds below).
; NOTE(review): q4-q7 (d8-d15) are clobbered without save/restore; AAPCS
; treats d8-d15 as callee-saved -- confirm this is acceptable to callers.
;-----------------------------------------------------------------------
     31     vld1.16         {q1}, [r0], r2              ;load input
     32     ldr             r12, _ffdct8_coeff_         ;r12 -> coefficient table
     33     vld1.16         {q2}, [r0], r2
     34     vld1.16         {q3}, [r0], r2
     35     vld1.16         {d0}, [r12]                 ;d0 = {x_c1, x_c2, x_c3, 0}
     36     vld1.16         {q4}, [r0], r2
     37 
     38     ;First for-loop
     39     ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3]
     40     ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3]
     41     vtrn.32         d2, d6
     42     vtrn.32         d3, d7
     43     vtrn.32         d4, d8
     44     vtrn.32         d5, d9
     45     vtrn.16         d2, d4
     46     vtrn.16         d3, d5
     47     vtrn.16         d6, d8
     48     vtrn.16         d7, d9
     49 
; Butterfly stage: left block in d10-d13 (q5/q6), right block in
; d22-d25 (q11/q12).
     50     vadd.s16        d10, d2, d8             ;ip[0]+ip[3]
     51     vadd.s16        d11, d4, d6             ;ip[1]+ip[2]
     52     vsub.s16        d12, d4, d6             ;ip[1]-ip[2]
     53     vsub.s16        d13, d2, d8             ;ip[0]-ip[3]
     54     vadd.s16        d22, d3, d9             ;same sums/differences, right block
     55     vadd.s16        d23, d5, d7
     56     vsub.s16        d24, d5, d7
     57     vsub.s16        d25, d3, d9
     58 
     59     vshl.i16        q5, q5, #1              ; a1, b1
     60     vshl.i16        q6, q6, #1              ; c1, d1
     61     vshl.i16        q1, q11, #1             ; right-block a1, b1
     62     vshl.i16        q2, q12, #1             ; right-block c1, d1
     63 
     64     vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
     65     vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
     66     vadd.s16        d24, d2, d3             ;right-block temp1
     67     vsub.s16        d25, d2, d3             ;right-block temp2
     68 
; vqdmulh.s16 computes (2*a*b)>>16 with saturation; the vshr #1 below
; turns that into the (a*b)>>16 the comments describe.
     69     vqdmulh.s16     q8, q7, d0[1]
     70     vqdmulh.s16     q13, q12, d0[1]
     71     vqdmulh.s16     q10, q6, d0[0]
     72     vqdmulh.s16     q15, q2, d0[0]
     73     vqdmulh.s16     q9, q6, d0[2]
     74     vqdmulh.s16     q14, q2, d0[2]
     75 
     76     vshr.s16        q8, q8, #1
     77     vshr.s16        q13, q13, #1
     78     vshr.s16        q10, q10, #1
     79     vshr.s16        q15, q15, #1
     80     vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 =  (d1 * x_c3)>>16
     81     vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 =  (d1 * x_c3)>>16
     82     vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
     83     vadd.s16        q15, q2, q15            ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1
     84 
; Row-pass outputs land back in d2..d9 so the same transpose sequence
; can feed the column pass below.
     85     vadd.s16        d2, d14, d16            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
     86     vadd.s16        d3, d24, d26            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
     87     vadd.s16        d6, d15, d17            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
     88     vadd.s16        d7, d25, d27            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
     89     vadd.s16        d4, d18, d21            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
     90     vadd.s16        d5, d28, d31            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
     91     vsub.s16        d8, d19, d20            ;op[3] = temp1 - temp2
     92     vsub.s16        d9, d29, d30            ;op[3] = temp1 - temp2
     93 
     94     ;Second for-loop
     95     ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12]
     96     ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12]
     97     vtrn.32         d2, d6
     98     vtrn.32         d3, d7
     99     vtrn.32         d4, d8
    100     vtrn.32         d5, d9
    101     vtrn.16         d2, d4
    102     vtrn.16         d3, d5
    103     vtrn.16         d6, d8
    104     vtrn.16         d7, d9
    105 
    106     vadd.s16        d10, d2, d8             ;a1 = ip[0]+ip[12]
    107     vadd.s16        d11, d4, d6             ;b1 = ip[4]+ip[8]
    108     vsub.s16        d12, d4, d6             ;c1 = ip[4]-ip[8]
    109     vsub.s16        d13, d2, d8             ;d1 = ip[0]-ip[12]
    110     vadd.s16        d2, d3, d9              ;right-block a1
    111     vadd.s16        d4, d5, d7              ;right-block b1
    112     vsub.s16        d24, d5, d7             ;right-block c1
    113     vsub.s16        d25, d3, d9             ;right-block d1
    114 
; Column pass: note there is no <<1 pre-scale here, unlike the row pass.
    115     vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
    116     vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
    117     vadd.s16        d22, d2, d4             ;right-block temp1
    118     vsub.s16        d23, d2, d4             ;right-block temp2
    119 
    120     vqdmulh.s16     q8, q7, d0[1]
    121     vqdmulh.s16     q13, q11, d0[1]
    122     vqdmulh.s16     q10, q6, d0[0]
    123     vqdmulh.s16     q15, q12, d0[0]
    124     vqdmulh.s16     q9, q6, d0[2]
    125     vqdmulh.s16     q14, q12, d0[2]
    126 
    127     vshr.s16        q8, q8, #1
    128     vshr.s16        q13, q13, #1
    129     vshr.s16        q10, q10, #1
    130     vshr.s16        q15, q15, #1
    131     vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 =  (d1 * x_c3)>>16
    132     vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 =  (d1 * x_c3)>>16
    133     vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
    134     vadd.s16        q15, q12, q15           ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1
    135 
; Column-pass outputs: q1/q2 collect the left block (d2..d5 = rows a2,
; b2, c2, d2), q3/q4 collect the right block (d6..d9).
    136     vadd.s16        d2, d14, d16            ;a2 = ((temp1 * x_c2 )>>16) + temp1
    137     vadd.s16        d6, d22, d26            ;a2 = ((temp1 * x_c2 )>>16) + temp1
    138     vadd.s16        d4, d15, d17            ;c2 = ((temp2 * x_c2 )>>16) + temp2
    139     vadd.s16        d8, d23, d27            ;c2 = ((temp2 * x_c2 )>>16) + temp2
    140     vadd.s16        d3, d18, d21            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
    141     vadd.s16        d7, d28, d31            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
    142     vsub.s16        d5, d19, d20            ;d2 = temp1 - temp2
    143     vsub.s16        d9, d29, d30            ;d2 = temp1 - temp2
    144 
; Final scaling: op[i] >>= 1, rounding toward zero.  vclt writes all-ones
; (-1) into negative lanes, so the vsub adds 1 to negative values before
; the arithmetic shift right.
    145     vclt.s16        q5, q1, #0
    146     vclt.s16        q6, q2, #0
    147     vclt.s16        q7, q3, #0
    148     vclt.s16        q8, q4, #0
    149 
    150     vsub.s16        q1, q1, q5
    151     vsub.s16        q2, q2, q6
    152     vsub.s16        q3, q3, q7
    153     vsub.s16        q4, q4, q8
    154 
    155     vshr.s16        q1, q1, #1
    156     vshr.s16        q2, q2, #1
    157     vshr.s16        q3, q3, #1
    158     vshr.s16        q4, q4, #1
    159 
    160     vst1.16         {q1, q2}, [r1]!         ;store all 32 output coefficients
    161     vst1.16         {q3, q4}, [r1]
    162 
    163     bx              lr
    164 
    165     ENDP
    166 
    167 ;-----------------
    168     AREA    fastfdct8x4_dat, DATA, READONLY
;Read-only data.  _ffdct8_coeff_ holds the address of the coefficient
;table; the code loads it with "ldr r12, _ffdct8_coeff_" and then reads
;the packed 16-bit constants into d0 with "vld1.16 {d0}, [r12]".
    172 _ffdct8_coeff_
    173     DCD     ffdct8_coeff
    174 ffdct8_coeff
; Two words = four packed 16-bit lanes, as seen by vld1.16 {d0}
; (little-endian: low halfword of each word is the lower lane):
;   d0[0] = 0xEC83 = 60547 (x_c1)
;   d0[1] = 0xB505 = 46341 (x_c2)
;   d0[2] = 0x61F8 = 25080 (x_c3)
;   d0[3] = 0      (unused)
    178     DCD     0xB505EC83, 0x000061F8
    179 
    180     END
    181