Home | History | Annotate | Download | only in neon
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_short_idct4x4llm_neon|     ;make the entry point visible to the linker
     13     ARM                                     ;assemble what follows as ARM (not Thumb) instructions
     14     REQUIRE8                                ;this code requires 8-byte stack alignment
     15     PRESERVE8                               ;this code preserves 8-byte stack alignment
     16 
     17     AREA ||.text||, CODE, READONLY, ALIGN=2 ;read-only code section, aligned to 2^2 = 4 bytes
     18 
     19 ;*************************************************************
     20 ;void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch,
     21 ;                               unsigned char *dst, int stride)
     22 ;In:    r0   = short *input          (16 coefficients, read as 32 contiguous bytes)
     23 ;       r1   = unsigned char *pred   (4 bytes read from each of 4 rows, 'pitch' apart)
     24 ;       r2   = int pitch
     25 ;       r3   = unsigned char *dst    (4 bytes written to each of 4 rows, 'stride' apart)
     26 ;       [sp] = int stride
     27 ;Clobb: r0, r1, r3, r12, q0-q3, q8-q10 (callee-saved d8-d15 are deliberately not used)
     28 ;*************************************************************
     29 ; static const int cospi8sqrt2minus1=20091;
     30 ; static const int sinpi8sqrt2      =35468;
     31 ; static const int rounding = 0;
     32 
     33 ; Optimization note: The data resulting from dequantization are signed
     34 ; 13-bit values in the range [-4096, 4095]. This allows the use of the
     35 ; "vqdmulh" (neon) instruction since it won't go out of range
     36 ; (13+16+1=30bits<32bits). This instruction gives the high half
     37 ; result of the multiplication that is needed in IDCT.
     38 
     39 |vp8_short_idct4x4llm_neon| PROC
     40     adr             r12, idct_coeff
     41     vld1.16         {q1, q2}, [r0]          ;d2..d5 = input[0..3],[4..7],[8..11],[12..15]
     42     vld1.16         {d0}, [r12]             ;d0 = {20091, 20091, 35468, 35468}
     43 
     44     vswp            d3, d4                  ;q1 = input[0..3],[8..11]; q2 = input[4..7],[12..15]
     45     ldr             r0, [sp]                ;r0 = stride (input pointer is no longer needed)
     46     ;---- first pass: each lane combines lines 0..3 held in d2, d4, d3, d5 ----
     47     vqdmulh.s16     q3, q2, d0[2]           ;q3 = (2 * q2 * 35468) >> 16, 35468 read as signed
     48     vqdmulh.s16     q8, q2, d0[0]           ;q8 = (2 * q2 * 20091) >> 16
     49 
     50     vqadd.s16       d20, d2, d3             ;a1
     51     vqsub.s16       d21, d2, d3             ;b1
     52 
     53     vshr.s16        q3, q3, #1              ;undo the doubling done by vqdmulh
     54     vshr.s16        q8, q8, #1
     55 
     56     vqadd.s16       q3, q3, q2              ;add x back: 35468 > 65536/2, so it was a negative s16
     57     vqadd.s16       q8, q8, q2              ;add x back: the constant is cospi8sqrt2 minus 1
     58 
     59     ;d6  - c1:temp1
     60     ;d7  - d1:temp2
     61     ;d16 - d1:temp1
     62     ;d17 - c1:temp2
     63 
     64     vqsub.s16       d18, d6, d17            ;c1
     65     vqadd.s16       d19, d7, d16            ;d1
     66 
     67     vqadd.s16       d2, d20, d19            ;a1 + d1
     68     vqadd.s16       d3, d21, d18            ;b1 + c1
     69     vqsub.s16       d4, d21, d18            ;b1 - c1
     70     vqsub.s16       d5, d20, d19            ;a1 - d1
     71     ;4x4 transpose of the 16-bit elements held in d2..d5
     72     vtrn.32         d2, d4
     73     vtrn.32         d3, d5
     74     vtrn.16         d2, d3
     75     vtrn.16         d4, d5
     76     ;---- second pass: same butterfly applied to the transposed data ----
     77     vswp            d3, d4                  ;regroup as before: q1 = lines 0,2; q2 = lines 1,3
     78 
     79     vqdmulh.s16     q3, q2, d0[2]
     80     vqdmulh.s16     q8, q2, d0[0]
     81 
     82     vqadd.s16       d20, d2, d3             ;a1
     83     vqsub.s16       d21, d2, d3             ;b1
     84 
     85     vshr.s16        q3, q3, #1
     86     vshr.s16        q8, q8, #1
     87 
     88     vqadd.s16       q3, q3, q2              ;add x back: 35468 > 65536/2, so it was a negative s16
     89     vqadd.s16       q8, q8, q2
     90 
     91     vqsub.s16       d18, d6, d17            ;c1
     92     vqadd.s16       d19, d7, d16            ;d1
     93 
     94     vqadd.s16       d2, d20, d19            ;a1 + d1
     95     vqadd.s16       d3, d21, d18            ;b1 + c1
     96     vqsub.s16       d4, d21, d18            ;b1 - c1
     97     vqsub.s16       d5, d20, d19            ;a1 - d1
     98     ;final scaling: rounding shift, (x + 4) >> 3
     99     vrshr.s16       d2, d2, #3
    100     vrshr.s16       d3, d3, #3
    101     vrshr.s16       d4, d4, #3
    102     vrshr.s16       d5, d5, #3
    103     ;transpose back so that d2..d5 hold output rows 0..3
    104     vtrn.32         d2, d4
    105     vtrn.32         d3, d5
    106     vtrn.16         d2, d3
    107     vtrn.16         d4, d5
    108 
    109     ; load prediction data: 4 bytes per row, r1 advances by pitch after each load
    110     vld1.32         d6[0], [r1], r2
    111     vld1.32         d6[1], [r1], r2
    112     vld1.32         d7[0], [r1], r2
    113     vld1.32         d7[1], [r1], r2
    114 
    115     ; add prediction and residual (prediction bytes are widened to 16 bits)
    116     vaddw.u8        q1, q1, d6              ;rows 0 and 1
    117     vaddw.u8        q2, q2, d7              ;rows 2 and 3
    118 
    119     vqmovun.s16     d1, q1                  ;saturate to unsigned 8 bits
    120     vqmovun.s16     d2, q2
    121 
    122     ; store to destination: 4 bytes per row, r3 advances by stride after each store
    123     vst1.32         d1[0], [r3], r0
    124     vst1.32         d1[1], [r3], r0
    125     vst1.32         d2[0], [r3], r0
    126     vst1.32         d2[1], [r3], r0
    127 
    128     bx              lr
    129 
    130     ENDP
    131 
    132 ;-----------------
    133 
    134 idct_coeff                                  ;constant table loaded into d0 with vld1.16
    135     DCD     0x4e7b4e7b, 0x8a8c8a8c          ;each word holds the same 16-bit value twice
    136 
    137 ;As 16-bit lanes: 20091 (0x4e7b), 20091, 35468 (0x8a8c), 35468
    138 ;The routine above multiplies by lane d0[0] (20091) and lane d0[2] (35468)
    139     END
    140