Home | History | Annotate | Download | only in neon
      1 ;
      2 ;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license and patent
      5 ;  grant that can be found in the LICENSE file in the root of the source
      6 ;  tree. All contributing project authors may be found in the AUTHORS
      7 ;  file in the root of the source tree.
      8 ;
      9 
     10     EXPORT  |vp9_idct32x32_1_add_neon|
     11     ARM
     12     REQUIRE8
     13     PRESERVE8
     14 
     15     AREA ||.text||, CODE, READONLY, ALIGN=2
     16 
    ;TODO(hkuang): put the following macros in a separate
    ;file so other idct functions can also use them.
     19     MACRO
     20     LD_16x8          $src, $stride
     21     vld1.8           {q8}, [$src], $stride
     22     vld1.8           {q9}, [$src], $stride
     23     vld1.8           {q10}, [$src], $stride
     24     vld1.8           {q11}, [$src], $stride
     25     vld1.8           {q12}, [$src], $stride
     26     vld1.8           {q13}, [$src], $stride
     27     vld1.8           {q14}, [$src], $stride
     28     vld1.8           {q15}, [$src], $stride
     29     MEND
     30 
     31     MACRO
     32     ADD_DIFF_16x8    $diff
     33     vqadd.u8         q8, q8, $diff
     34     vqadd.u8         q9, q9, $diff
     35     vqadd.u8         q10, q10, $diff
     36     vqadd.u8         q11, q11, $diff
     37     vqadd.u8         q12, q12, $diff
     38     vqadd.u8         q13, q13, $diff
     39     vqadd.u8         q14, q14, $diff
     40     vqadd.u8         q15, q15, $diff
     41     MEND
     42 
     43     MACRO
     44     SUB_DIFF_16x8    $diff
     45     vqsub.u8         q8, q8, $diff
     46     vqsub.u8         q9, q9, $diff
     47     vqsub.u8         q10, q10, $diff
     48     vqsub.u8         q11, q11, $diff
     49     vqsub.u8         q12, q12, $diff
     50     vqsub.u8         q13, q13, $diff
     51     vqsub.u8         q14, q14, $diff
     52     vqsub.u8         q15, q15, $diff
     53     MEND
     54 
     55     MACRO
     56     ST_16x8          $dst, $stride
     57     vst1.8           {q8}, [$dst], $stride
     58     vst1.8           {q9}, [$dst], $stride
     59     vst1.8           {q10},[$dst], $stride
     60     vst1.8           {q11},[$dst], $stride
     61     vst1.8           {q12},[$dst], $stride
     62     vst1.8           {q13},[$dst], $stride
     63     vst1.8           {q14},[$dst], $stride
     64     vst1.8           {q15},[$dst], $stride
     65     MEND
     66 
     67 ;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
     68 ;                              int dest_stride)
     69 ;
     70 ; r0  int16_t input
     71 ; r1  uint8_t *dest
     72 ; r2  int dest_stride
     73 
     74 |vp9_idct32x32_1_add_neon| PROC
     75     push             {lr}
     76     pld              [r1]
     77     add              r3, r1, #16               ; r3 dest + 16 for second loop
     78     ldrsh            r0, [r0]
     79 
     80     ; generate cospi_16_64 = 11585
     81     mov              r12, #0x2d00
     82     add              r12, #0x41
     83 
     84     ; out = dct_const_round_shift(input[0] * cospi_16_64)
     85     mul              r0, r0, r12               ; input[0] * cospi_16_64
     86     add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
     87     asr              r0, r0, #14               ; >> DCT_CONST_BITS
     88 
     89     ; out = dct_const_round_shift(out * cospi_16_64)
     90     mul              r0, r0, r12               ; out * cospi_16_64
     91     mov              r12, r1                   ; save dest
     92     add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
     93     asr              r0, r0, #14               ; >> DCT_CONST_BITS
     94 
     95     ; a1 = ROUND_POWER_OF_TWO(out, 6)
     96     add              r0, r0, #32               ; + (1 <<((6) - 1))
     97     asrs             r0, r0, #6                ; >> 6
     98     bge              diff_positive_32_32
     99 
    100 diff_negative_32_32
    101     neg              r0, r0
    102     usat             r0, #8, r0
    103     vdup.u8          q0, r0
    104     mov              r0, #4
    105 
    106 diff_negative_32_32_loop
    107     sub              r0, #1
    108     LD_16x8          r1, r2
    109     SUB_DIFF_16x8    q0
    110     ST_16x8          r12, r2
    111 
    112     LD_16x8          r1, r2
    113     SUB_DIFF_16x8    q0
    114     ST_16x8          r12, r2
    115     cmp              r0, #2
    116     moveq            r1, r3
    117     moveq            r12, r3
    118     cmp              r0, #0
    119     bne              diff_negative_32_32_loop
    120     pop              {pc}
    121 
    122 diff_positive_32_32
    123     usat             r0, #8, r0
    124     vdup.u8          q0, r0
    125     mov              r0, #4
    126 
    127 diff_positive_32_32_loop
    128     sub              r0, #1
    129     LD_16x8          r1, r2
    130     ADD_DIFF_16x8    q0
    131     ST_16x8          r12, r2
    132 
    133     LD_16x8          r1, r2
    134     ADD_DIFF_16x8    q0
    135     ST_16x8          r12, r2
    136     cmp              r0, #2
    137     moveq            r1, r3
    138     moveq            r12, r3
    139     cmp              r0, #0
    140     bne              diff_positive_32_32_loop
    141     pop              {pc}
    142 
    143     ENDP             ; |vp9_idct32x32_1_add_neon|
    144     END
    145