;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT |vp8_short_walsh4x4_armv6|

    ARM
    REQUIRE8
    PRESERVE8

    AREA    |.text|, CODE, READONLY  ; name this block of code

;void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
; r0    short *input,
; r1    short *output,
; r2    int pitch
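;
; Overview: this routine computes a forward 4x4 Walsh-Hadamard transform of
; 16-bit coefficients (in VP8, the transform applied to the block of luma DC
; values).  pitch is the byte stride between input rows; the 16 results are
; written contiguously to *output.  As a reading aid, a roughly equivalent
; scalar version is sketched below in C-style comments.  This is an editor's
; sketch derived from the listing (the names ip/op/t are illustrative, and
; the 16-bit saturation of QADD16/QSUB16 is ignored), not the project's
; reference implementation verbatim.
;
;   for (r = 0; r < 4; r++) {              /* row pass */
;       a1 = (ip[0] + ip[2]) << 2;  d1 = (ip[1] + ip[3]) << 2;
;       b1 = (ip[0] - ip[2]) << 2;  c1 = (ip[1] - ip[3]) << 2;
;       t[r][0] = a1 + d1 + (a1 != 0);
;       t[r][1] = b1 + c1;
;       t[r][2] = b1 - c1;
;       t[r][3] = a1 - d1;
;       ip += pitch / 2;                   /* pitch is in bytes */
;   }
;   for (c = 0; c < 4; c++) {              /* column pass */
;       a1 = t[0][c] + t[2][c];  b1 = t[0][c] - t[2][c];
;       d1 = t[1][c] + t[3][c];  c1 = t[1][c] - t[3][c];
;       a2 = a1 + d1;  b2 = b1 + c1;  c2 = b1 - c1;  d2 = a1 - d1;
;       op[c]      = (a2 + 3 + (a2 < 0)) >> 3;
;       op[c + 4]  = (b2 + 3 + (b2 < 0)) >> 3;
;       op[c + 8]  = (c2 + 3 + (c2 < 0)) >> 3;
;       op[c + 12] = (d2 + 3 + (d2 < 0)) >> 3;
;   }
;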
|vp8_short_walsh4x4_armv6| PROC

    stmdb       sp!, {r4 - r11, lr}

    ldrd        r4, r5, [r0], r2
    ldr         lr, c00040004
    ldrd        r6, r7, [r0], r2

    ; 0-3
    qadd16      r3, r4, r5          ; [d1|a1] [1+3   |   0+2]
    qsub16      r4, r4, r5          ; [c1|b1] [1-3   |   0-2]

    ldrd        r8, r9, [r0], r2
    ; 4-7
    qadd16      r5, r6, r7          ; [d1|a1] [5+7   |   4+6]
    qsub16      r6, r6, r7          ; [c1|b1] [5-7   |   4-6]

    ldrd        r10, r11, [r0]
    ; 8-11
    qadd16      r7, r8, r9          ; [d1|a1] [9+11  |  8+10]
    qsub16      r8, r8, r9          ; [c1|b1] [9-11  |  8-10]

    ; 12-15
    qadd16      r9, r10, r11        ; [d1|a1] [13+15 | 12+14]
    qsub16      r10, r10, r11       ; [c1|b1] [13-15 | 12-14]


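; At this point each row has been reduced to two packed registers:
; r3/r5/r7/r9 hold [d1|a1] (sums) and r4/r6/r8/r10 hold [c1|b1]
; (differences) for rows 0..3.  lr holds 0x00040004, so SMUAD rd, rx, lr
; yields 4*lo(rx) + 4*hi(rx) and SMUSD yields 4*lo(rx) - 4*hi(rx), i.e. the
; multiply also applies the << 2 scaling of the transform.  The LSLS #16 /
; ADDNE pairs in the column-0 (op[0,4,8,12]) computation below implement the
; "+ (a1 != 0)" adjustment by testing the low halfword of each sum register.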
    lsls        r2, r3, #16
    smuad       r11, r3, lr         ; A0 = a1<<2 + d1<<2
    addne       r11, r11, #1        ; A0 += (a1!=0)

    lsls        r2, r7, #16
    smuad       r12, r7, lr         ; C0 = a1<<2 + d1<<2
    addne       r12, r12, #1        ; C0 += (a1!=0)

    add         r0, r11, r12        ; a1_0 = A0 + C0
    sub         r11, r11, r12       ; b1_0 = A0 - C0

    lsls        r2, r5, #16
    smuad       r12, r5, lr         ; B0 = a1<<2 + d1<<2
    addne       r12, r12, #1        ; B0 += (a1!=0)

    lsls        r2, r9, #16
    smuad       r2, r9, lr          ; D0 = a1<<2 + d1<<2
    addne       r2, r2, #1          ; D0 += (a1!=0)

    add         lr, r12, r2         ; d1_0 = B0 + D0
    sub         r12, r12, r2        ; c1_0 = B0 - D0

    ; op[0,4,8,12]
    adds        r2, r0, lr          ; a2 = a1_0 + d1_0
    addmi       r2, r2, #1          ; += a2 < 0
    add         r2, r2, #3          ; += 3
    subs        r0, r0, lr          ; d2 = a1_0 - d1_0
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1]            ; op[0]

    addmi       r0, r0, #1          ; += d2 < 0
    add         r0, r0, #3          ; += 3
    ldr         lr, c00040004       ; reload constant (lr was used for d1_0)
    mov         r0, r0, asr #3      ; >> 3
    strh        r0, [r1, #24]       ; op[12]

    adds        r2, r11, r12        ; b2 = b1_0 + c1_0
    addmi       r2, r2, #1          ; += b2 < 0
    add         r2, r2, #3          ; += 3
    subs        r0, r11, r12        ; c2 = b1_0 - c1_0
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #8]        ; op[4]

    addmi       r0, r0, #1          ; += c2 < 0
    add         r0, r0, #3          ; += 3
    smusd       r3, r3, lr          ; A3 = a1<<2 - d1<<2
    smusd       r7, r7, lr          ; C3 = a1<<2 - d1<<2
    mov         r0, r0, asr #3      ; >> 3
    strh        r0, [r1, #16]       ; op[8]

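; Rounding note: each ADDS/ADDMI #1, ADD #3, ASR #3 sequence computes
; (x + (x < 0) + 3) >> 3, a divide by 8 that rounds to nearest with ties
; toward zero, matching the scalar form a2 += (a2 < 0); (a2 + 3) >> 3.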

    ; op[3,7,11,15]
    add         r0, r3, r7          ; a1_3 = A3 + C3
    sub         r3, r3, r7          ; b1_3 = A3 - C3

    smusd       r5, r5, lr          ; B3 = a1<<2 - d1<<2
    smusd       r9, r9, lr          ; D3 = a1<<2 - d1<<2
    add         r7, r5, r9          ; d1_3 = B3 + D3
    sub         r5, r5, r9          ; c1_3 = B3 - D3

    adds        r2, r0, r7          ; a2 = a1_3 + d1_3
    addmi       r2, r2, #1          ; += a2 < 0
    add         r2, r2, #3          ; += 3
    adds        r9, r3, r5          ; b2 = b1_3 + c1_3
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #6]        ; op[3]

    addmi       r9, r9, #1          ; += b2 < 0
    add         r9, r9, #3          ; += 3
    subs        r2, r3, r5          ; c2 = b1_3 - c1_3
    mov         r9, r9, asr #3      ; >> 3
    strh        r9, [r1, #14]       ; op[7]

    addmi       r2, r2, #1          ; += c2 < 0
    add         r2, r2, #3          ; += 3
    subs        r9, r0, r7          ; d2 = a1_3 - d1_3
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #22]       ; op[11]

    addmi       r9, r9, #1          ; += d2 < 0
    add         r9, r9, #3          ; += 3
    smuad       r3, r4, lr          ; A1 = b1<<2 + c1<<2
    smuad       r5, r8, lr          ; C1 = b1<<2 + c1<<2
    mov         r9, r9, asr #3      ; >> 3
    strh        r9, [r1, #30]       ; op[15]

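; Note: the SMUAD/SMUSD that start the next column (A1/C1 above, A2/C2
; further down) are interleaved with the previous column's rounding and
; stores; this looks like deliberate scheduling to hide result latencies on
; ARM11-class pipelines, though that is an editorial reading rather than a
; claim documented in this file.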
    ; op[1,5,9,13]
    add         r0, r3, r5          ; a1_1 = A1 + C1
    sub         r3, r3, r5          ; b1_1 = A1 - C1

    smuad       r7, r6, lr          ; B1 = b1<<2 + c1<<2
    smuad       r9, r10, lr         ; D1 = b1<<2 + c1<<2
    add         r5, r7, r9          ; d1_1 = B1 + D1
    sub         r7, r7, r9          ; c1_1 = B1 - D1

    adds        r2, r0, r5          ; a2 = a1_1 + d1_1
    addmi       r2, r2, #1          ; += a2 < 0
    add         r2, r2, #3          ; += 3
    adds        r9, r3, r7          ; b2 = b1_1 + c1_1
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #2]        ; op[1]

    addmi       r9, r9, #1          ; += b2 < 0
    add         r9, r9, #3          ; += 3
    subs        r2, r3, r7          ; c2 = b1_1 - c1_1
    mov         r9, r9, asr #3      ; >> 3
    strh        r9, [r1, #10]       ; op[5]

    addmi       r2, r2, #1          ; += c2 < 0
    add         r2, r2, #3          ; += 3
    subs        r9, r0, r5          ; d2 = a1_1 - d1_1
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #18]       ; op[9]

    addmi       r9, r9, #1          ; += d2 < 0
    add         r9, r9, #3          ; += 3
    smusd       r4, r4, lr          ; A2 = b1<<2 - c1<<2
    smusd       r8, r8, lr          ; C2 = b1<<2 - c1<<2
    mov         r9, r9, asr #3      ; >> 3
    strh        r9, [r1, #26]       ; op[13]


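; The final column reuses the [c1|b1] registers (r4/r6/r8/r10) with SMUSD,
; giving the b1<<2 - c1<<2 values that feed op[2,6,10,14].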
    ; op[2,6,10,14]
    add         r11, r4, r8         ; a1_2 = A2 + C2
    sub         r12, r4, r8         ; b1_2 = A2 - C2

    smusd       r6, r6, lr          ; B2 = b1<<2 - c1<<2
    smusd       r10, r10, lr        ; D2 = b1<<2 - c1<<2
    add         r4, r6, r10         ; d1_2 = B2 + D2
    sub         r8, r6, r10         ; c1_2 = B2 - D2

    adds        r2, r11, r4         ; a2 = a1_2 + d1_2
    addmi       r2, r2, #1          ; += a2 < 0
    add         r2, r2, #3          ; += 3
    adds        r9, r12, r8         ; b2 = b1_2 + c1_2
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #4]        ; op[2]

    addmi       r9, r9, #1          ; += b2 < 0
    add         r9, r9, #3          ; += 3
    subs        r2, r12, r8         ; c2 = b1_2 - c1_2
    mov         r9, r9, asr #3      ; >> 3
    strh        r9, [r1, #12]       ; op[6]

    addmi       r2, r2, #1          ; += c2 < 0
    add         r2, r2, #3          ; += 3
    subs        r9, r11, r4         ; d2 = a1_2 - d1_2
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #20]       ; op[10]

    addmi       r9, r9, #1          ; += d2 < 0
    add         r9, r9, #3          ; += 3
    mov         r9, r9, asr #3      ; >> 3
    strh        r9, [r1, #28]       ; op[14]


    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_short_walsh4x4_armv6|

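; Literal-pool constant: two packed halfwords of 4, loaded into lr and used
; as the multiplier for SMUAD/SMUSD above, which also provides the << 2
; scaling applied during the row pass.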
c00040004
    DCD         0x00040004

    END