; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateVerHalf function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE

;// h264bsdInterpolateVerHalf register allocation

ref     RN 0

mb      RN 1
buff    RN 1

count   RN 2
x0      RN 2

res     RN 3
y0      RN 3

tmp1    RN 4

tmp2    RN 5
height  RN 5

tmp3    RN 6
partW   RN 6

tmp4    RN 7
partH   RN 7

tmp5    RN 8
tmp6    RN 9

tmpa    RN 10
tmpb    RN 11
width   RN 12

plus16  RN 14


;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateVerHalf

;// Approach to vertical interpolation
;//
;// Interpolation is done using 32-bit loads and stores
;// and 16-bit arithmetic. A 4x4 block is processed
;// in each round.
;//
;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
;//           ..
;//           ..
;// |a_m1|a_m1|a_m1|a_m1|...
;// |b_m1|b_m1|b_m1|b_m1|...
;// |c_m1|c_m1|c_m1|c_m1|...
;// |d_m1|d_m1|d_m1|d_m1|...

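;// As a rough scalar model of what one call computes (a sketch only: the
;// helper name is illustrative, "ref" is assumed to already point at the
;// first filter row and "mb" to be a 16-byte-stride output, matching the
;// USAT16 #13 + ">>5" clipping used below; this is not part of the API):
;//
;//   static void RefVerHalf(const unsigned char *ref, unsigned char *mb,
;//                          int width, int partW, int partH)
;//   {
;//       int x, y;
;//       for (y = 0; y < partH; y++) {
;//           for (x = 0; x < partW; x++) {
;//               const unsigned char *p = ref + y*width + x;
;//               int sum = p[0] + p[5*width]
;//                       + 20*(p[2*width] + p[3*width])
;//                       -  5*(p[1*width] + p[4*width]) + 16;
;//               if (sum < 0)    sum = 0;      /* USAT16 lower clamp     */
;//               if (sum > 8191) sum = 8191;   /* USAT16 #13 upper clamp */
;//               mb[y*16 + x] = (unsigned char)(sum >> 5);
;//           }
;//       }
;//   }
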
h264bsdInterpolateVerHalf
    STMFD   sp!, {r0-r11, lr}
    SUB     sp, sp, #0x1e4

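    ;// Stack frame layout after the two instructions above (offsets from sp):
    ;//   0x000-0x013  outgoing stack arguments for the h264bsdFillBlock call
    ;//   0x028-0x1e3  21x21 temporary buffer (p1), used only when filling
    ;//   0x1e4-0x1f3  saved r0-r3, i.e. ref, mb, x0, y0
    ;//   0x1f4-0x217  saved r4-r11, lr
    ;//   0x218-0x227  caller stack arguments: width, height, partWidth, partHeight
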
    CMP     x0, #0
    BLT     do_fill                 ;// (x0 < 0)
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp5, x0, partW         ;// (x0+partWidth)
    LDR     width, [sp,#0x218]      ;// width
    CMP     tmp5, width
    BHI     do_fill                 ;// (x0+partW)>width

    CMP     y0, #0
    BLT     do_fill                 ;// (y0 < 0)
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp6, y0, partH         ;// (y0+partHeight)
    ADD     tmp6, tmp6, #5          ;// (y0+partH+5)
    LDR     height, [sp,#0x21c]     ;// height
    CMP     tmp6, height
    BLS     skip_fill               ;// no overfill needed


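;// In C terms, the checks above and the do_fill block below amount to
;// (a sketch; the exact h264bsdFillBlock argument list is not restated here):
;//
;//   if (x0 < 0 || x0 + partW > width ||
;//       y0 < 0 || y0 + partH + 5 > height) {
;//       /* pad a partW x (partH+5) area into the on-stack buffer p1 ... */
;//       x0 = 0;  y0 = 0;  ref = p1;  width = partW;
;//   }
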
do_fill
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp5, partH, #5         ;// r2 = partH + 5;
    LDR     height, [sp,#0x21c]     ;// height
    LDR     partW, [sp,#0x220]      ;// partWidth
    STMIB   sp, {height, partW}     ;// sp+4 = height, sp+8 = partWidth
    STR     tmp5, [sp,#0xc]         ;// sp+c = partHeight+5
    STR     partW, [sp,#0x10]       ;// sp+10 = partWidth
    LDR     width, [sp,#0x218]      ;// width
    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
    BL      h264bsdFillBlock

    MOV     x0, #0
    STR     x0,[sp,#0x1ec]          ;// x0 = 0
    STR     x0,[sp,#0x1f0]          ;// y0 = 0
    ADD     ref,sp,#0x28            ;// ref = p1
    STR     partW, [sp,#0x218]


skip_fill
    LDR     x0, [sp,#0x1ec]         ;// x0
    LDR     y0, [sp,#0x1f0]         ;// y0
    LDR     width, [sp,#0x218]      ;// width
    MLA     tmp6, width, y0, x0     ;// y0*width+x0
    ADD     ref, ref, tmp6          ;// ref += y0*width+x0
    LDR     mb, [sp, #0x1e8]        ;// mb

    ADD     count, partW, partH, LSL #16    ;// |partH|partW|
    LDR     tmp5, = 0x00010001
    SSUB16  count, count, tmp5      ;// |partH-1|partW-1|
    LDR     plus16, = 0x00100010

    AND     tmp1, count, #0x000000FF ;// partWidth-1
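    ;// From here on "count" packs three small counters into one register:
    ;//   bits [15:0]   partWidth-1, kept as a reload value (also in tmp1)
    ;//   bits [23:16]  partHeight-1, the vertical loop counter
    ;//   bits [31:24]  horizontal loop counter, reloaded with partWidth-1
    ;//                 at the top of every loop_y iteration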


loop_y
    ADD     count, count, tmp1, LSL #24  ;// partWidth-1 to top byte

loop_x
    LDR     tmp1, [ref], width     ;// |a4|a3|a2|a1|
    LDR     tmp2, [ref], width     ;// |c4|c3|c2|c1|
    LDR     tmp3, [ref], width     ;// |g4|g3|g2|g1|
    LDR     tmp4, [ref], width     ;// |m4|m3|m2|m1|
    LDR     tmp5, [ref], width     ;// |r4|r3|r2|r1|
    LDR     tmp6, [ref], width     ;// |t4|t3|t2|t1|

    ;// first four pixels
    UXTB16  tmpa, tmp3                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp4            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp2                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)

    UXTAB16 tmpb, tmpb, tmp5            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp3, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp2, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp5, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp1, [ref], width
    LDR     tmpa, = 0xFF00FF00

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    STR     res, [mb], #16              ;// next row (mb)
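
    ;// Pixels 1 and 3 of the row were filtered in the low bytes of the two
    ;// 16-bit lanes (UXTB16), pixels 2 and 4 in the high bytes (ROR #8).
    ;// USAT16 #13 plus the shift by 5 and the 0x00FF00FF / 0xFF00FF00 masks
    ;// give, per output pixel, roughly:
    ;//   out = clip(0, 255, (16 + A + T + 20*(G+M) - 5*(C+R)) >> 5);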

    ;// tmp2 = |a4|a3|a2|a1|
    ;// tmp3 = |c4|c3|c2|c1|
    ;// tmp4 = |g4|g3|g2|g1|
    ;// tmp5 = |m4|m3|m2|m1|
    ;// tmp6 = |r4|r3|r2|r1|
    ;// tmp1 = |t4|t3|t2|t1|

    ;// second four pixels
    UXTB16  tmpa, tmp4                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp5            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp3                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp6            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp4, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp5, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp3, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp6, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp2, [ref], width
    LDR     tmpa, = 0xFF00FF00

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    STR     res, [mb], #16              ;// next row

    ;// tmp3 = |a4|a3|a2|a1|
    ;// tmp4 = |c4|c3|c2|c1|
    ;// tmp5 = |g4|g3|g2|g1|
    ;// tmp6 = |m4|m3|m2|m1|
    ;// tmp1 = |r4|r3|r2|r1|
    ;// tmp2 = |t4|t3|t2|t1|

    ;// third four pixels
    UXTB16  tmpa, tmp5                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp6            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp4                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp1            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp5, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp4, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp1, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A+T


    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp3, [ref]
    LDR     tmpa, = 0xFF00FF00

    ;// decrement loop_x counter
    SUBS    count, count, #4<<24        ;// (partWidth-1) -= 4;

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    STR     res, [mb], #16              ;// next row

    ;// tmp4 = |a4|a3|a2|a1|
    ;// tmp5 = |c4|c3|c2|c1|
    ;// tmp6 = |g4|g3|g2|g1|
    ;// tmp1 = |m4|m3|m2|m1|
    ;// tmp2 = |r4|r3|r2|r1|
    ;// tmp3 = |t4|t3|t2|t1|

    ;// fourth four pixels
    UXTB16  tmpa, tmp6                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp1            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp5                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp2            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp4            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp6, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp5, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp2, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp4, = 0xFF00FF00

    ;// calculate "ref" address for next round
    SUB     ref, ref, width, LSL #3     ;// ref -= 8*width;
    ADD     ref, ref, #4                ;// next column (4 pixels)
    AND     tmpa, tmp4, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    STR     res, [mb], #-44

    BCS     loop_x
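
    ;// Pointer bookkeeping: each loop_x round read 8 rows (8 post-indexed
    ;// loads of "width"), so ref was pulled back by 8*width and advanced
    ;// by 4 to the next 4-pixel column, while mb moved 16+16+16-44 = +4
    ;// bytes. Below, the outer loop steps ref down 4 rows and both
    ;// pointers back by partWidth to the start of the next 4-row band.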

    ADDS    count, count, #252<<16      ;// (partHeight-1) -= 4;
    ADD     ref, ref, width, LSL #2     ;// ref += 4*width
    AND     tmp1, count, #0x000000FF    ;// partWidth-1
    ADD     tmp2, tmp1, #1              ;// partWidth
    SUB     ref, ref, tmp2              ;// ref -= partWidth
    ADD     mb, mb, #64
    SUB     mb, mb, tmp2                ;// mb -= partWidth
    BGE     loop_y

    ADD     sp, sp, #0x1f4
    LDMFD   sp!, {r4-r11, pc}

    END