Home | History | Annotate | Download | only in neon
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10     EXPORT  |vp8_short_inv_walsh4x4_neon|
     11     EXPORT  |vp8_short_inv_walsh4x4_1_neon|
     12 
     13     ARM
     14     REQUIRE8
     15     PRESERVE8
     16 
     17     AREA    |.text|, CODE, READONLY  ; name this block of code
     18 
     ;-------------------------------------------------------------------------
     ; short vp8_short_inv_walsh4x4_neon(short *input, short *output)
     ;
     ; 4x4 inverse Walsh transform on signed 16-bit coefficients.
     ; Two butterfly passes (first across rows 0..3, then within each row)
     ; followed by a (x + 3) >> 3 rounding shift on every result.
     ;
     ; In:    r0 = input  (16 x s16, read as four consecutive 64-bit rows)
     ;        r1 = output (16 x s16)
     ; Out:   output[0..15] written; r1 advanced by 32 bytes (writeback)
     ; Clobb: q0-q3
     ;
     ; The butterflies use 16-bit modular arithmetic. The rounding step uses
     ; a halving add so that "+3" cannot wrap for values 32765..32767:
     ;     ((x + 3) >> 1) >> 2  ==  (x + 3) >> 3   computed without overflow.
     ;
     ; NOTE(review): the prototype above says "short", but r0 is never written
     ; here, so no meaningful value is returned - confirm callers treat it as
     ; void.
     ;-------------------------------------------------------------------------
|vp8_short_inv_walsh4x4_neon| PROC

    ; d0..d3 = rows 0..3 of the input block (4 x s16 per register)
    vldmia      r0, {d0-d3}

    ; ---- first pass: combine rows, one column per lane ----
    ; a/d land in q2 (d4:d5) and b/c in q3 (d6:d7) so that the four
    ; outputs can be formed with one q-wide add and one q-wide subtract.
    vadd.s16    d4, d0, d3          ; a = [0] + [12]
    vadd.s16    d6, d1, d2          ; b = [4] + [8]
    vsub.s16    d5, d0, d3          ; d = [0] - [12]
    vsub.s16    d7, d1, d2          ; c = [4] - [8]

    vadd.s16    q0, q2, q3          ; d0 = a + b,  d1 = d + c
    vsub.s16    q1, q2, q3          ; d2 = a - b,  d3 = d - c

    ; ---- transpose 4x4 so each register holds one column ----
    vtrn.32     d0, d2              ; d0:  0  1  8  9
                                    ; d2:  2  3 10 11
    vtrn.32     d1, d3              ; d1:  4  5 12 13
                                    ; d3:  6  7 14 15

    vtrn.16     d0, d1              ; d0:  0  4  8 12
                                    ; d1:  1  5  9 13
    vtrn.16     d2, d3              ; d2:  2  6 10 14
                                    ; d3:  3  7 11 15

    ; ---- second pass: same butterfly, now one row per lane ----
    vadd.s16    d4, d0, d3          ; a = [0] + [3]
    vadd.s16    d6, d1, d2          ; b = [1] + [2]
    vsub.s16    d5, d0, d3          ; d = [0] - [3]
    vsub.s16    d7, d1, d2          ; c = [1] - [2]

    vadd.s16    q0, q2, q3          ; d0 = e = a + b,  d1 = f = d + c
    vsub.s16    q1, q2, q3          ; d2 = g = a - b,  d3 = h = d - c

    ; ---- rounding: (x + 3) >> 3 without 16-bit overflow in the add ----
    vmov.i16    q2, #3
    vhadd.s16   q0, q0, q2          ; (e/f + 3) >> 1, sum kept at full precision
    vhadd.s16   q1, q1, q2          ; (g/h + 3) >> 1
    vshr.s16    q0, q0, #2          ; remaining >> 2  => (e/f + 3) >> 3
    vshr.s16    q1, q1, #2          ; remaining >> 2  => (g/h + 3) >> 3

    ; ---- transpose back to row order ----
    vtrn.32     d0, d2
    vtrn.32     d1, d3
    vtrn.16     d0, d1
    vtrn.16     d2, d3

    ; rows 0..3 -> output[0..15]; r1 ends 32 bytes past its entry value
    vstmia      r1!, {d0-d3}

    bx          lr
    ENDP    ; |vp8_short_inv_walsh4x4_neon|
     76 
     77 
     ;-------------------------------------------------------------------------
     ; short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
     ;
     ; Fills all 16 output values with (input[0] + 3) >> 3.
     ;
     ; In:    r0 = input  (only input[0] is read)
     ;        r1 = output (16 x s16)
     ; Out:   output[0..15] written; r1 advanced by 32 bytes (writeback)
     ; Clobb: r2, q0
     ;
     ; The add and shift are done in a 32-bit core register so that "+3"
     ; cannot wrap for input[0] in 32765..32767; the result always fits back
     ; into 16 bits.
     ;
     ; NOTE(review): the prototype above says "short", but r0 is never written
     ; here, so no meaningful value is returned - confirm callers treat it as
     ; void.
     ;-------------------------------------------------------------------------
|vp8_short_inv_walsh4x4_1_neon| PROC
    ldrsh       r2, [r0]            ; r2 = input[0], sign-extended to 32 bits
    add         r2, r2, #3          ; input[0] + 3, no 16-bit wrap
    mov         r2, r2, asr #3      ; arithmetic >> 3

    vdup.16     q0, r2              ; replicate low 16 bits into all 8 lanes

    vstmia      r1!, {d0-d1}        ; output[0..7]
    vstmia      r1!, {d0-d1}        ; output[8..15]

    bx          lr
    ENDP    ; |vp8_short_inv_walsh4x4_1_neon|
     95 
     96     END
     97