;  vp8_short_fdct4x4_armv6.asm -- ARMv6 forward 4x4 DCT for VP8 (libvpx)
      1 ;
      2 ;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11     EXPORT |vp8_short_fdct4x4_armv6|
     12 
     13     ARM
     14     REQUIRE8
     15     PRESERVE8
     16 
     17     AREA    |.text|, CODE, READONLY
     18 ; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
     19 |vp8_short_fdct4x4_armv6| PROC
     20 
     21     stmfd       sp!, {r4 - r12, lr}
     22 
     23     ; PART 1
     24 
     25     ; coeffs 0-3
     26     ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]
     27 
     28     ldr         r10, c7500
     29     ldr         r11, c14500
     30     ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4]
     31     ldr         lr, c0x00080008
     32     ror         r5, r5, #16         ; [i2 | i3]
     33 
     34     qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift
     35     qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift
     36 
     37     add         r0, r0, r2          ; update input pointer
     38 
     39     qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd
     40                                     ; with 2217*4 and 5352*4 without losing the
     41                                     ; sign bit (overflow)
     42 
     43     smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8
     44     smusd       r5, r6, lr          ; o2 = (i1+i2)*8 - (i0+i3)*8
     45 
     46     smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500)
     47     smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500)
     48 
     49     ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6]
     50 
     51     pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
     52     pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]
     53 
     54     str         r6, [r1, #4]
     55 
     56     ; coeffs 4-7
     57     ror         r9, r9, #16         ; [i6 | i7]
     58 
     59     qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift
     60     qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift
     61 
     62     add         r0, r0, r2          ; update input pointer
     63 
     64     qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
     65                                     ; with 2217*4 and 5352*4 without losing the
     66                                     ; sign bit (overflow)
     67 
     68     smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8
     69     smusd       r8, r6, lr          ; o6 = (i5+i6)*8 - (i4+i7)*8
     70 
     71     smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500)
     72     smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500)
     73 
     74     ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10]
     75 
     76     pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
     77     pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]
     78 
     79     str         r6, [r1, #12]
     80 
     81     ; coeffs 8-11
     82     ror         r5, r5, #16         ; [i10 | i11]
     83 
     84     qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift
     85     qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift
     86 
     87     add         r0, r0, r2          ; update input pointer
     88 
     89     qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
     90                                     ; with 2217*4 and 5352*4 without losing the
     91                                     ; sign bit (overflow)
     92 
     93     smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8
     94     smusd       r8, r6, lr          ; o10 = (i9+i10)*8 - (i8+i11)*8
     95 
     96     smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500)
     97     smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500)
     98 
     99     ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14]
    100 
    101     pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
    102     pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]
    103 
    104     str         r6, [r1, #20]
    105 
    106     ; coeffs 12-15
    107     ror         r5, r5, #16         ; [i14 | i15]
    108 
    109     qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift
    110     qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift
    111 
    112     qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
    113                                     ; with 2217*4 and 5352*4 without losing the
    114                                     ; sign bit (overflow)
    115 
    116     smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8
    117     smusd       r5, r6, lr          ; o14 = (i13+i14)*8 - (i12+i15)*8
    118 
    119     smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500)
    120     smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500)
    121 
    122     pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
    123     pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]
    124 
    125     str         r6, [r1, #28]
    126 
    127 
    128     ; PART 2 -------------------------------------------------
    129     ldr         r11, c12000
    130     ldr         r10, c51000
    131     ldr         lr, c0x00070007
    132 
    133     qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
    134     qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
    135     qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
    136     qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]
    137 
    138     qadd16      r4, r4, lr          ; a1 + 7
    139 
    140     add         r0, r11, #0x10000   ; add (d!=0)
    141 
    142     qadd16      r2, r4, r5          ; a1 + b1 + 7
    143     qsub16      r3, r4, r5          ; a1 - b1 + 7
    144 
    145     ldr         r12, c0x08a914e8    ; [2217 | 5352]
    146 
    147     lsl         r8, r2, #16         ; prepare bottom halfword for scaling
    148     asr         r2, r2, #4          ; scale top halfword
    149     lsl         r9, r3, #16         ; prepare bottom halfword for scaling
    150     asr         r3, r3, #4          ; scale top halfword
    151     pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
    152     pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
    153 
    154     smulbt      r2, r6, r12         ; [ ------ | c1*2217]
    155     str         r4, [r1, #0]        ; [     o1 |      o0]
    156     smultt      r3, r6, r12         ; [c1*2217 | ------ ]
    157     str         r5, [r1, #16]       ; [     o9 |      o8]
    158 
    159     smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
    160     smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
    161 
    162     smulbb      r2, r6, r12         ; [ ------ | c1*5352]
    163     smultb      r3, r6, r12         ; [c1*5352 | ------ ]
    164 
    165     lsls        r6, r7, #16         ; d1 != 0 ?
    166     addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
    167     addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
    168     asrs        r6, r7, #16
    169     addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
    170     addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
    171 
    172     smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
    173     smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
    174 
    175     pkhtb       r9, r9, r8, asr #16
    176 
    177     sub         r4, r4, r2
    178     sub         r5, r5, r3
    179 
    180     ldr         r3, [r1, #4]        ; [i3 | i2]
    181 
    182     pkhtb       r5, r5, r4, asr #16 ; [o13|o12]
    183 
    184     str         r9, [r1, #8]        ; [o5 | 04]
    185 
    186     ldr         r9, [r1, #12]       ; [i7 | i6]
    187     ldr         r8, [r1, #28]       ; [i15|i14]
    188     ldr         r2, [r1, #20]       ; [i11|i10]
    189     str         r5, [r1, #24]       ; [o13|o12]
    190 
    191     qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
    192     qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]
    193 
    194     qadd16      r4, r4, lr          ; a1 + 7
    195 
    196     qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
    197     qadd16      r2, r4, r5          ; a1 + b1 + 7
    198     qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
    199     qsub16      r3, r4, r5          ; a1 - b1 + 7
    200 
    201     lsl         r8, r2, #16         ; prepare bottom halfword for scaling
    202     asr         r2, r2, #4          ; scale top halfword
    203     lsl         r9, r3, #16         ; prepare bottom halfword for scaling
    204     asr         r3, r3, #4          ; scale top halfword
    205     pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
    206     pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
    207 
    208     smulbt      r2, r6, r12         ; [ ------ | c1*2217]
    209     str         r4, [r1, #4]        ; [     o3 |      o2]
    210     smultt      r3, r6, r12         ; [c1*2217 | ------ ]
    211     str         r5, [r1, #20]       ; [    o11 |     o10]
    212 
    213     smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
    214     smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
    215 
    216     smulbb      r2, r6, r12         ; [ ------ | c1*5352]
    217     smultb      r3, r6, r12         ; [c1*5352 | ------ ]
    218 
    219     lsls        r6, r7, #16         ; d1 != 0 ?
    220     addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
    221     addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
    222 
    223     asrs        r6, r7, #16
    224     addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
    225     addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
    226 
    227     smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
    228     smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
    229 
    230     pkhtb       r9, r9, r8, asr #16
    231 
    232     sub         r4, r4, r2
    233     sub         r5, r5, r3
    234 
    235     str         r9, [r1, #12]       ; [o7 | o6]
    236     pkhtb       r5, r5, r4, asr #16 ; [o15|o14]
    237 
    238     str         r5, [r1, #28]       ; [o15|o14]
    239 
    240     ldmfd       sp!, {r4 - r12, pc}
    241 
    242     ENDP
    243 
    244 ; Used constants
    245 c7500
    246     DCD     7500
    247 c14500
    248     DCD     14500
    249 c0x22a453a0
    250     DCD     0x22a453a0
    251 c0x00080008
    252     DCD     0x00080008
    253 c12000
    254     DCD     12000
    255 c51000
    256     DCD     51000
    257 c0x00070007
    258     DCD     0x00070007
    259 c0x08a914e8
    260     DCD     0x08a914e8
    261 
    262     END
    263