Home | History | Annotate | Download | only in neon
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_dequant_idct_add_neon|     ; make the entry point visible to the linker
     13     ARM                                     ; assemble what follows as ARM (A32) instructions
     14     REQUIRE8                                ; this file requires an 8-byte aligned stack
     15     PRESERVE8                               ; this file keeps the stack 8-byte aligned
     16 
     17     AREA ||.text||, CODE, READONLY, ALIGN=2 ; read-only code area, aligned to 2^2 = 4 bytes
     18 ; void vp8_dequant_idct_add_neon(short *input, short *dq,
     19 ;                                unsigned char *dest, int stride)
     20 ; r0 = input : 16 coefficients; read, then overwritten with zeros
     21 ; r1 = dq    : 16 dequantisation factors;  r3 = stride: bytes between dest rows
     22 ; r2 = dest  : 4 rows x 4 bytes; the transform output is added and stored back
     23 ; Uses only r0-r3, r12, q0-q3, q8-q12, q14, q15: q4-q7 are callee-saved (AAPCS).
     24 
     25 |vp8_dequant_idct_add_neon| PROC
     26     vld1.16         {q8, q9}, [r0]          ; q8:q9   = input[0..15]
     27     vld1.16         {q10, q11}, [r1]        ; q10:q11 = dq[0..15]
     28 
     29     add             r1, r2, r3              ; r1 = dest + stride (dq pointer is dead)
     30     lsl             r3, r3, #1              ; r3 = 2 * stride
     31 
     32     vld1.32         {d24[0]}, [r2], r3      ; dest row 0; r2 -> row 2
     33     vld1.32         {d24[1]}, [r1], r3      ; dest row 1; r1 -> row 3
     34     vld1.32         {d25[0]}, [r2]          ; dest row 2
     35     vld1.32         {d25[1]}, [r1]          ; dest row 3
     36 
     37     adr             r12, cospi8sqrt2minus1  ; r12 -> first of the two constant words
     38 
     39     vmul.i16        q1, q8, q10             ; dequantise: d2 = coeffs 0-3,  d3 = 4-7
     40     vmul.i16        q2, q9, q11             ;             d4 = coeffs 8-11, d5 = 12-15
     41 
     42 ; |short_idct4x4llm_neon| sequence, first pass
     43     vld1.16         {d0}, [r12]             ; d0 = {0x4e7b, 0x4e7b, 0x8a8c, 0x8a8c}
     44     vswp            d3, d4                  ; q1 = coeffs 0-3 | 8-11, q2 = 4-7 | 12-15
     45 
     46     vqdmulh.s16     q3, q2, d0[2]           ; sat((2 * q2 * 0x8a8c) >> 16)
     47     vqdmulh.s16     q8, q2, d0[0]           ; sat((2 * q2 * 0x4e7b) >> 16)
     48 
     49     vqadd.s16       d20, d2, d3             ; a1
     50     vqsub.s16       d21, d2, d3             ; b1
     51 
     52     vshr.s16        q3, q3, #1              ; halve: undo the doubling done by vqdmulh
     53     vshr.s16        q8, q8, #1
     54 
     55     vqadd.s16       q3, q3, q2              ; + q2: 0x8a8c is negative when read as s16
     56     vqadd.s16       q8, q8, q2              ; q8 = q2 + ((q2 * 0x4e7b) >> 16)
     57 
     58     vqsub.s16       d18, d6, d17            ; c1
     59     vqadd.s16       d19, d7, d16            ; d1
     60 
     61     vqadd.s16       d2, d20, d19            ; a1 + d1
     62     vqadd.s16       d3, d21, d18            ; b1 + c1
     63     vqsub.s16       d4, d21, d18            ; b1 - c1
     64     vqsub.s16       d5, d20, d19            ; a1 - d1
     65 
     66     vtrn.32         d2, d4                  ; these four vtrn transpose the 4x4
     67     vtrn.32         d3, d5                  ; block of 16-bit values held in d2-d5
     68     vtrn.16         d2, d3
     69     vtrn.16         d4, d5
     70 
     71 ; second pass; q14:q15 are zeroed along the way for memset(input, 0, 32)
     72     vmov.i16        q14, #0
     73 
     74     vswp            d3, d4                  ; same pairing as in the first pass
     75     vqdmulh.s16     q3, q2, d0[2]
     76     vqdmulh.s16     q8, q2, d0[0]
     77 
     78     vqadd.s16       d20, d2, d3             ; a1
     79     vqsub.s16       d21, d2, d3             ; b1
     80 
     81     vmov            q15, q14                ; second register of zeros
     82 
     83     vshr.s16        q3, q3, #1
     84     vshr.s16        q8, q8, #1
     85 
     86     vqadd.s16       q3, q3, q2
     87     vqadd.s16       q8, q8, q2
     88 
     89     vqsub.s16       d18, d6, d17            ; c1
     90     vqadd.s16       d19, d7, d16            ; d1
     91 
     92     vqadd.s16       d2, d20, d19            ; a1 + d1
     93     vqadd.s16       d3, d21, d18            ; b1 + c1
     94     vqsub.s16       d4, d21, d18            ; b1 - c1
     95     vqsub.s16       d5, d20, d19            ; a1 - d1
     96 
     97     vst1.16         {q14, q15}, [r0]        ; input[0..15] = 0 (32 bytes)
     98 
     99     vrshr.s16       d2, d2, #3              ; rounding shift: (x + 4) >> 3
    100     vrshr.s16       d3, d3, #3
    101     vrshr.s16       d4, d4, #3
    102     vrshr.s16       d5, d5, #3
    103 
    104     vtrn.32         d2, d4                  ; transpose again: d2..d5 = rows 0..3
    105     vtrn.32         d3, d5
    106     vtrn.16         d2, d3
    107     vtrn.16         d4, d5
    108 
    109     vaddw.u8        q1, q1, d24             ; rows 0-1 += dest bytes (zero-extended)
    110     vaddw.u8        q2, q2, d25             ; rows 2-3 += dest bytes
    111 
    112     sub             r2, r2, r3              ; r2 back to dest (row 0)
    113     sub             r1, r1, r3              ; r1 back to dest + stride (row 1)
    114 
    115     vqmovun.s16     d0, q1                  ; saturate to 0..255 and narrow to bytes
    116     vqmovun.s16     d1, q2
    117 
    118     vst1.32         {d0[0]}, [r2], r3       ; store row 0; r2 -> row 2
    119     vst1.32         {d0[1]}, [r1], r3       ; store row 1; r1 -> row 3
    120     vst1.32         {d1[0]}, [r2]           ; store row 2
    121     vst1.32         {d1[1]}, [r1]           ; store row 3
    122 
    123     bx              lr
    124 
    125     ENDP           ; |vp8_dequant_idct_add_neon|
    126 
    127 ; Constant Pool - one 8-byte vld1.16 {d0} reads both words: keep them adjacent, in this order
    128 cospi8sqrt2minus1 DCD 0x4e7b4e7b           ; 0x4e7b (20091) in both halfwords -> d0[0], d0[1]
    129 sinpi8sqrt2       DCD 0x8a8c8a8c           ; 0x8a8c (35468, negative as s16) -> d0[2], d0[3]
    130 
    131     END
    132