Home | History | Annotate | Download | only in neon
      1 ;
      2 ;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11     EXPORT  |vp9_iht4x4_16_add_neon|    ; make the entry point visible to the linker
     12     ARM                                 ; assemble what follows as ARM (not Thumb) code
     13     REQUIRE8                            ; this file requires an 8-byte aligned stack
     14     PRESERVE8                           ; this file preserves 8-byte stack alignment
     15 
     16     AREA ||.text||, CODE, READONLY, ALIGN=2 ; read-only code area; the macros below are defined here
     17 
     18     ; Parallel 1D IDCT on all the columns of a 4x4 16-bit data matrix that is
     19     ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
     20     ; cospi_16_64. d2 must contain cospi_24_64. The output is stored back
     21     ; into d16-d19. This macro clobbers q10-q15 (q11 and q12 only through
     22     ; d23 and d24) and uses them as scratch registers during the calculation.
     23     MACRO
     24     IDCT4x4_1D
     25     ; stage 1: butterfly on input[0]/input[2], then widening multiplies
     26     vadd.s16    d23, d16, d18   ; (input[0] + input[2])
     27     vsub.s16    d24, d16, d18   ; (input[0] - input[2])
     28     ; 16x16 -> 32 bit products; q15 and q10 are completed by vmlsl/vmlal below
     29     vmull.s16   q15, d17, d2    ; input[1] * cospi_24_64
     30     vmull.s16   q10, d17, d0    ; input[1] * cospi_8_64
     31     vmull.s16   q13, d23, d1    ; (input[0] + input[2]) * cospi_16_64
     32     vmull.s16   q14, d24, d1    ; (input[0] - input[2]) * cospi_16_64
     33     vmlsl.s16   q15, d19, d0    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
     34     vmlal.s16   q10, d19, d2    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
     35     ; The four narrowing shifts below must stay in this order: q14 is read
     36     ; (into d27) before its own halves d28/d29 are overwritten.
     37     vqrshrn.s32 d26, q13, #14   ; dct_const_round_shift -> step[0] (low half of q13)
     38     vqrshrn.s32 d27, q14, #14   ; dct_const_round_shift -> step[1] (high half of q13)
     39     vqrshrn.s32 d29, q15, #14   ; dct_const_round_shift -> step[2] (high half of q14)
     40     vqrshrn.s32 d28, q10, #14   ; dct_const_round_shift -> step[3] (low half of q14)
     41 
     42     ; stage 2: q13 = {step[0], step[1]}, q14 = {step[3], step[2]}
     43     ; output[0] = step[0] + step[3];
     44     ; output[1] = step[1] + step[2];
     45     ; output[3] = step[0] - step[3];
     46     ; output[2] = step[1] - step[2];
     47     vadd.s16    q8,  q13, q14   ; d16 = output[0], d17 = output[1]
     48     vsub.s16    q9,  q13, q14   ; d18 = output[3], d19 = output[2]
     49     vswp        d18, d19        ; reorder so d18 = output[2], d19 = output[3]
     50     MEND
     51 
     52     ; Parallel 1D IADST on all the columns of a 4x4 16-bit data matrix that is
     53     ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
     54     ; d5 must contain sinpi_4_9. d6 and the core register r0 must both contain
     55     ; sinpi_3_9. The output is stored back into d16-d19. This macro clobbers
     56     ; q10-q15 and also reuses q8/q9 as scratch once the inputs are consumed.
     57     MACRO
     58     IADST4x4_1D
     59     vmull.s16   q10, d3, d16    ; s0 = sinpi_1_9 * x0
     60     vmull.s16   q11, d4, d16    ; s1 = sinpi_2_9 * x0
     61     vmull.s16   q12, d6, d17    ; s2 = sinpi_3_9 * x1
     62     vmull.s16   q13, d5, d18    ; s3 = sinpi_4_9 * x2
     63     vmull.s16   q14, d3, d18    ; s4 = sinpi_1_9 * x2
     64     vmovl.s16   q15, d16        ; expand x0 from 16 bit to 32 bit
     65     vaddw.s16   q15, q15, d19   ; x0 + x3
     66     vmull.s16   q8, d4, d19     ; s5 = sinpi_2_9 * x3 (overwrites d16/d17, already consumed)
     67     vsubw.s16   q15, q15, d18   ; s7 = x0 + x3 - x2
     68     vmull.s16   q9, d5, d19     ; s6 = sinpi_4_9 * x3 (overwrites d18/d19, last use of the inputs)
     69     ; from here on every value is a 32-bit lane in a q register
     70     vadd.s32    q10, q10, q13   ; x0 = s0 + s3 + s5
     71     vadd.s32    q10, q10, q8    ; (adds the s5 term)
     72     vsub.s32    q11, q11, q14   ; x1 = s1 - s4 - s6
     73     vdup.32     q8, r0          ; duplicate sinpi_3_9 from r0 into four 32-bit lanes (s5 in q8 is dead)
     74     vsub.s32    q11, q11, q9    ; (subtracts the s6 term)
     75     vmul.s32    q15, q15, q8    ; x2 = sinpi_3_9 * s7
     76     ; q12 still holds s2, which is used as x3 below
     77     vadd.s32    q13, q10, q12   ; s0 = x0 + x3
     78     vadd.s32    q10, q10, q11   ; x0 + x1
     79     vadd.s32    q14, q11, q12   ; s1 = x1 + x3
     80     vsub.s32    q10, q10, q12   ; s3 = x0 + x1 - x3
     81 
     82     ; dct_const_round_shift: rounding, saturating narrow by 14 bits to 16 bit
     83     vqrshrn.s32 d16, q13, #14   ; output[0] from s0
     84     vqrshrn.s32 d17, q14, #14   ; output[1] from s1
     85     vqrshrn.s32 d18, q15, #14   ; output[2] from x2
     86     vqrshrn.s32 d19, q10, #14   ; output[3] from s3
     87     MEND
     88 
     89     ; Generate cosine constants in d0 - d2 for the IDCT. Clobbers r0, r3, r12.
     90     MACRO
     91     GENERATE_COSINE_CONSTANTS
     92     ; cospi_8_64 = 15137 = 0x3b21
     93     mov         r0, #0x3b00     ; high byte of the constant
     94     add         r0, #0x21       ; add the low byte -> r0 = 0x3b21
     95     ; cospi_16_64 = 11585 = 0x2d41
     96     mov         r3, #0x2d00     ; high byte of the constant
     97     add         r3, #0x41       ; add the low byte -> r3 = 0x2d41
     98     ; cospi_24_64 = 6270 = 0x187e
     99     mov         r12, #0x1800    ; high byte of the constant
    100     add         r12, #0x7e      ; add the low byte -> r12 = 0x187e
    101 
    102     ; generate constant vectors: each value is copied to all four 16-bit lanes
    103     vdup.16     d0, r0          ; duplicate cospi_8_64
    104     vdup.16     d1, r3          ; duplicate cospi_16_64
    105     vdup.16     d2, r12         ; duplicate cospi_24_64
    106     MEND
    107 
    108     ; Generate sine constants in d3 - d7 for the IADST. Clobbers r0, r3, r12.
    109     MACRO
    110     GENERATE_SINE_CONSTANTS
    111     ; sinpi_1_9 = 5283 = 0x14A3
    112     mov         r0, #0x1400     ; high byte of the constant
    113     add         r0, #0xa3       ; add the low byte -> r0 = 0x14a3
    114     ; sinpi_2_9 = 9929 = 0x26C9
    115     mov         r3, #0x2600     ; high byte of the constant
    116     add         r3, #0xc9       ; add the low byte -> r3 = 0x26c9
    117     ; sinpi_4_9 = 15212 = 0x3B6C
    118     mov         r12, #0x3b00    ; high byte of the constant
    119     add         r12, #0x6c      ; add the low byte -> r12 = 0x3b6c
    120 
    121     ; generate constant vectors
    122     vdup.16     d3, r0          ; duplicate sinpi_1_9
    123     ; r0 has been copied into d3, so it is reused for the fourth constant
    124     ; sinpi_3_9 = 13377 = 0x3441
    125     mov         r0, #0x3400     ; high byte of the constant
    126     add         r0, #0x41       ; add the low byte -> r0 = 0x3441
    127     ; r0 keeps sinpi_3_9 on exit; IADST4x4_1D reads it with "vdup.32 q8, r0"
    128     vdup.16     d4, r3          ; duplicate sinpi_2_9
    129     vdup.16     d5, r12         ; duplicate sinpi_4_9
    130     vdup.16     q3, r0          ; duplicate sinpi_3_9 into all of q3 (d6 and d7)
    131     MEND
    132 
    133     ; Transpose, in place, a 4x4 16-bit data matrix held one row per register in d16-d19.
    134     MACRO
    135     TRANSPOSE4X4
    136     vtrn.16     d16, d17        ; swap odd 16-bit lanes of d16 with even lanes of d17
    137     vtrn.16     d18, d19        ; same for the d18/d19 pair
    138     vtrn.32     q8, q9          ; swap 32-bit lane pairs between q8 and q9 to finish the transpose
    139     MEND
    140 
    141     AREA     Block, CODE, READONLY ; name this block of code
    142 ;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
    143 ;                               int dest_stride, int tx_type)
    144 ; Applies two 1D inverse transforms (IDCT/IADST) and adds the result to dest.
    145 ; r0  int16_t *input
    146 ; r1  uint8_t *dest
    147 ; r2  int dest_stride
    148 ; r3  int tx_type
    149 ; This function will only handle tx_type of 1,2,3 (anything not 2 or 3 takes the tx_type 1 path).
    150 |vp9_iht4x4_16_add_neon| PROC
    151 
    152     ; load the 16 input coefficients into d16-d19 (r0 is post-incremented but not read again)
    153     vld1.s16    {q8,q9}, [r0]!
    154 
    155     ; transpose the input data
    156     TRANSPOSE4X4
    157 
    158     ; decide the type of transform; r3 is only overwritten later, by the constant macros
    159     cmp         r3, #2
    160     beq         idct_iadst
    161     cmp         r3, #3
    162     beq         iadst_iadst
    163     ; fall through for tx_type 1: IDCT pass first, then IADST pass
    164 iadst_idct
    165     ; generate constants (sine macro last: it leaves sinpi_3_9 in r0 for IADST4x4_1D)
    166     GENERATE_COSINE_CONSTANTS
    167     GENERATE_SINE_CONSTANTS
    168 
    169     ; first transform rows
    170     IDCT4x4_1D
    171 
    172     ; transpose the matrix
    173     TRANSPOSE4X4
    174 
    175     ; then transform columns
    176     IADST4x4_1D
    177 
    178     b end_vp9_iht4x4_16_add_neon
    179     ; tx_type 2: IADST pass first, then IDCT pass
    180 idct_iadst
    181     ; generate constants (sine macro last: it leaves sinpi_3_9 in r0 for IADST4x4_1D)
    182     GENERATE_COSINE_CONSTANTS
    183     GENERATE_SINE_CONSTANTS
    184 
    185     ; first transform rows
    186     IADST4x4_1D
    187 
    188     ; transpose the matrix
    189     TRANSPOSE4X4
    190 
    191     ; then transform columns
    192     IDCT4x4_1D
    193 
    194     b end_vp9_iht4x4_16_add_neon
    195     ; tx_type 3: IADST in both passes, so only the sine constants are needed
    196 iadst_iadst
    197     ; generate constants
    198     GENERATE_SINE_CONSTANTS
    199 
    200     ; first transform rows
    201     IADST4x4_1D
    202 
    203     ; transpose the matrix
    204     TRANSPOSE4X4
    205 
    206     ; then transform columns
    207     IADST4x4_1D
    208     ; common tail: the transformed 4x4 block is in d16-d19 (q8, q9)
    209 end_vp9_iht4x4_16_add_neon
    210     ; ROUND_POWER_OF_TWO(temp_out[j], 4)
    211     vrshr.s16   q8, q8, #4
    212     vrshr.s16   q9, q9, #4
    213     ; load four 32-bit (4-pixel) rows of dest, stepping r1 by dest_stride
    214     vld1.32     {d26[0]}, [r1], r2
    215     vld1.32     {d26[1]}, [r1], r2
    216     vld1.32     {d27[0]}, [r1], r2
    217     vld1.32     {d27[1]}, [r1]      ; no post-increment: r1 stays on the last row
    218 
    219     ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
    220     vaddw.u8    q8, q8, d26         ; widen the 8 dest bytes in d26 and add to q8
    221     vaddw.u8    q9, q9, d27         ; widen the 8 dest bytes in d27 and add to q9
    222 
    223     ; clip_pixel: saturating narrow of signed 16 bit to unsigned 8 bit
    224     vqmovun.s16 d26, q8
    225     vqmovun.s16 d27, q9
    226 
    227     ; do the stores in reverse order with negative post-increment, by changing
    228     ; the sign of the stride (r1 still points at the last row loaded above)
    229     rsb         r2, r2, #0          ; r2 = -dest_stride
    230     vst1.32     {d27[1]}, [r1], r2
    231     vst1.32     {d27[0]}, [r1], r2
    232     vst1.32     {d26[1]}, [r1], r2
    233     vst1.32     {d26[0]}, [r1]  ; no post-increment
    234     bx          lr                  ; return; nothing was pushed, so there is nothing to restore
    235     ENDP  ; |vp9_iht4x4_16_add_neon|
    236 
    237     END
    238