;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_fast_fdct4x4_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
;NOTE:
;The input is *src_diff. src_diff is calculated as:
;    diff_ptr[c] = src_ptr[c] - pred_ptr[c];   (in the Subtract* functions)
;where *src_ptr and *pred_ptr are both unsigned char.
;Therefore, *src_diff is always in the range [-255, 255].
;CAUTION:
;The input values of the 25th (second-order DC) block are set up in vp8_build_dcblock and can
;fall outside [-255, 255]. However, the VP8 encoder only ever uses vp8_short_fdct4x4_c for the
;25th block, never vp8_fast_fdct4x4_c, so assuming *input is in [-255, 255] is valid here but
;would not be valid in vp8_short_fdct4x4_c.
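;
;For reference, a rough C sketch of the 1-D pass implemented below, reconstructed from the
;instruction comments in this file (not copied from the C source); x_c1, x_c2 and x_c3 denote
;the three signed 16-bit lanes loaded from ffdct_coeff at the end of this file:
;
;    a1 = (ip[0] + ip[3]) << 1;      c1 = (ip[1] - ip[2]) << 1;
;    b1 = (ip[1] + ip[2]) << 1;      d1 = (ip[0] - ip[3]) << 1;
;
;    temp1 = a1 + b1;
;    temp2 = a1 - b1;
;
;    op[0] = ((temp1 * x_c2) >> 16) + temp1;
;    op[2] = ((temp2 * x_c2) >> 16) + temp2;
;    op[1] = ((c1 * x_c3) >> 16) + (((d1 * x_c1) >> 16) + d1);
;    op[3] = ((d1 * x_c3) >> 16) - (((c1 * x_c1) >> 16) + c1);
;
;The same pass is then applied down the columns (without the initial << 1), and every result
;is finally halved with rounding toward zero before being stored.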

|vp8_fast_fdct4x4_neon| PROC
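    ;Register usage on entry (AAPCS, matching the C prototype above):
    ;  r0 = short *input, r1 = short *output, r2 = pitch (the loads below advance r0 by r2 bytes per row)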
    vld1.16         {d2}, [r0], r2              ;load input
    ldr             r12, _ffdct_coeff_          ;r12 = &ffdct_coeff
    vld1.16         {d3}, [r0], r2
    vld1.16         {d4}, [r0], r2
    vld1.16         {d0}, [r12]                 ;d0 = {x_c1, x_c2, x_c3, 0}
    vld1.16         {d5}, [r0], r2

    ;First for-loop
    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
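    ;(This VTRN.32/VTRN.16 pair is the standard NEON 4x4 transpose of 16-bit data: afterwards
    ; each d register holds the same coefficient position from all four rows, so one vector
    ; instruction processes all four rows at once.)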
    vtrn.32         d2, d4
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5

    vadd.s16        d6, d2, d5              ;ip[0]+ip[3]
    vadd.s16        d7, d3, d4              ;ip[1]+ip[2]
    vsub.s16        d8, d3, d4              ;ip[1]-ip[2]
    vsub.s16        d9, d2, d5              ;ip[0]-ip[3]
    vshl.i16        q3, q3, #1              ;a1 = (ip[0]+ip[3])<<1, b1 = (ip[1]+ip[2])<<1
    vshl.i16        q4, q4, #1              ;c1 = (ip[1]-ip[2])<<1, d1 = (ip[0]-ip[3])<<1

    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1

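    ;VQDMULH.S16 returns the saturated high half of the doubled product, i.e. (a * b * 2) >> 16,
    ;so each VQDMULH below followed by a VSHR #1 approximates the C expression (a * b) >> 16,
    ;with the multiplier taken from a signed 16-bit lane of d0.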
    vqdmulh.s16     q6, q5, d0[1]           ;(temp1, temp2) * x_c2
    vqdmulh.s16     q8, q4, d0[0]           ;(c1, d1) * x_c1
    vqdmulh.s16     q7, q4, d0[2]           ;(c1, d1) * x_c3

    vshr.s16        q6, q6, #1
    vshr.s16        q8, q8, #1
    vshr.s16        q7, q7, #1              ;d14:temp1 = (c1 * x_c3)>>16;  d15:temp1 = (d1 * x_c3)>>16
    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d10, d12            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d4, d11, d13            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d3, d14, d17            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
    vsub.s16        d5, d15, d16            ;op[3] = temp1 - temp2

    ;Second for-loop
    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
    vtrn.32         d2, d4
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5

    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[12]
    vadd.s16        d7, d3, d4              ;b1 = ip[4]+ip[8]
    vsub.s16        d8, d3, d4              ;c1 = ip[4]-ip[8]
    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[12]
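    ;(Note: unlike the first pass, a1..d1 are not doubled here; there is no vshl in this pass.)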

    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1

    vqdmulh.s16     q6, q5, d0[1]           ;(temp1, temp2) * x_c2
    vqdmulh.s16     q8, q4, d0[0]           ;(c1, d1) * x_c1
    vqdmulh.s16     q7, q4, d0[2]           ;(c1, d1) * x_c3

    vshr.s16        q6, q6, #1
    vshr.s16        q8, q8, #1
    vshr.s16        q7, q7, #1              ;d14:temp1 = (c1 * x_c3)>>16;  d15:temp1 = (d1 * x_c3)>>16
    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d10, d12            ;a2 = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d4, d11, d13            ;c2 = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d3, d14, d17            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
    vsub.s16        d5, d15, d16            ;d2 = temp1 - temp2

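    ;Final scaling: op[i] >>= 1 with rounding toward zero. vclt builds a mask of -1 in each
    ;negative lane; subtracting that mask adds 1 to the negative lanes, so the arithmetic
    ;shift right by 1 matches C integer division by 2.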
    vclt.s16        q3, q1, #0
    vclt.s16        q4, q2, #0

    vsub.s16        q1, q1, q3
    vsub.s16        q2, q2, q4

    vshr.s16        q1, q1, #1
    vshr.s16        q2, q2, #1

    vst1.16         {q1, q2}, [r1]          ;store the 16 outputs to *output

    bx              lr

    ENDP

;-----------------

_ffdct_coeff_
    DCD     ffdct_coeff
ffdct_coeff
; 60547 =  0xEC83
; 46341 =  0xB505
; 25080 =  0x61F8
    DCD     0xB505EC83, 0x000061F8
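;With the little-endian layout used here, the vld1.16 {d0} above sees these two words as lanes
;    d0[0] = 0xEC83 (x_c1), d0[1] = 0xB505 (x_c2), d0[2] = 0x61F8 (x_c3), d0[3] = 0
;matching the x_c1/x_c2/x_c3 names used in the instruction comments.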

    END