; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateVerQuarter function
;--
;-------------------------------------------------------------------------------

    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE

;// h264bsdInterpolateVerQuarter register allocation

ref     RN 0

mb      RN 1
buff    RN 1

count   RN 2
x0      RN 2

res     RN 3
y0      RN 3

tmp1    RN 4

tmp2    RN 5
height  RN 5

tmp3    RN 6
partW   RN 6

tmp4    RN 7
partH   RN 7

tmp5    RN 8
tmp6    RN 9

tmpa    RN 10
tmpb    RN 11
width   RN 12

plus16  RN 14


;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateVerQuarter

;// Approach to vertical interpolation
;//
;// Interpolation is done using 32-bit loads and stores
;// and 16-bit arithmetic. A 4x4 block of output pixels is
;// processed in each round.
;//
;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
;//           ..
;//           ..
;// |a_m1|a_m1|a_m1|a_m1|...
;// |b_m1|b_m1|b_m1|b_m1|...
;// |c_m1|c_m1|c_m1|c_m1|...
;// |d_m1|d_m1|d_m1|d_m1|...

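;// Per output pixel this amounts to, in rough C-style pseudocode
;// (clip() clamps to [0,255]; A, C, G, M, R, T are six vertically
;// adjacent reference samples of one column, named as in the filter
;// comments below):
;//
;//   b   = clip((A - 5*C + 20*G + 20*M - 5*R + T + 16) >> 5);
;//   out = verOffset ? (b + M + 1) >> 1 : (b + G + 1) >> 1;
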
h264bsdInterpolateVerQuarter
    STMFD   sp!, {r0-r11, lr}
    SUB     sp, sp, #0x1e4

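;// Stack frame note: the 13 pushed registers (0x34 bytes) sit above the
;// 0x1e4 bytes reserved here, so the saved r1 (mb), r2 (x0) and r3 (y0)
;// are found at sp+0x1e8, sp+0x1ec and sp+0x1f0, and the caller's stack
;// arguments width, height, partWidth, partHeight and verOffset at
;// sp+0x218..sp+0x228. sp+0x28 is the start of the temporary buffer used
;// by the fill path below.
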
    CMP     x0, #0
    BLT     do_fill                 ;// (x0 < 0)
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp5, x0, partW         ;// (x0+partWidth)
    LDR     width, [sp,#0x218]      ;// width
    CMP     tmp5, width
    BHI     do_fill                 ;// (x0+partW)>width

    CMP     y0, #0
    BLT     do_fill                 ;// (y0 < 0)
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp6, y0, partH         ;// (y0+partHeight)
    ADD     tmp6, tmp6, #5          ;// (y0+partH+5)
    LDR     height, [sp,#0x21c]     ;// height
    CMP     tmp6, height
    BLS     skip_fill               ;// no overfill needed


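;// Fill path: if any part of the partWidth x (partHeight+5) source area
;// lies outside the picture, h264bsdFillBlock builds a copy of that area
;// (with out-of-picture samples padded) in the stack buffer at sp+0x28.
;// ref, x0 and y0 are then redirected to the copy and the stored width
;// is replaced by partWidth.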
do_fill
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp5, partH, #5         ;// partHeight + 5
    LDR     height, [sp,#0x21c]     ;// height
    LDR     partW, [sp,#0x220]      ;// partWidth
    STMIB   sp, {height, partW}     ;// sp+4 = height, sp+8 = partWidth
    STR     tmp5, [sp,#0xc]         ;// sp+c partHeight+5
    STR     partW, [sp,#0x10]       ;// sp+10 = partWidth
    LDR     width, [sp,#0x218]      ;// width
    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
    BL      h264bsdFillBlock

    MOV     x0, #0
    STR     x0, [sp,#0x1ec]         ;// x0 = 0
    STR     x0, [sp,#0x1f0]         ;// y0 = 0
    ADD     ref, sp, #0x28          ;// ref = p1
    STR     partW, [sp,#0x218]


skip_fill
    LDR     x0, [sp,#0x1ec]         ;// x0
    LDR     y0, [sp,#0x1f0]         ;// y0
    LDR     width, [sp,#0x218]      ;// width
    MLA     tmp6, width, y0, x0     ;// y0*width+x0
    ADD     ref, ref, tmp6          ;// ref += y0*width+x0
    LDR     mb, [sp, #0x1e8]        ;// mb

    ADD     count, partW, partH, LSL #8    ;// |xx|xx|partH|partW|
    LDR     tmp5, = 0x00010100
    RSB     count, tmp5, count, LSL #8      ;// |xx|partH-1|partW-1|xx|
    LDR     tmp2, [sp, #0x228]      ;// verOffset
    ADD     count, count, tmp2      ;// |xx|partH-1|partW-1|verOffset|
    LDR     plus16, = 0x00100010

    AND     tmp1, count, #0x0000FF00 ;// (partWidth-1) << 8

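;// From here on "count" packs the loop state into a single register:
;//   bits [7:0]    verOffset (0 or 1)
;//   bits [15:8]   partWidth-1  (reload value for the x loop)
;//   bits [23:16]  partHeight-1 (y loop counter)
;//   bits [31:24]  x loop counter, reloaded from partWidth-1 at loop_y
;// tmp1 keeps (partWidth-1)<<8 around for that per-row reload.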

loop_y
    ADD     count, count, tmp1, LSL #16  ;// partWidth-1 to top byte

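;// Each loop_x iteration produces a 4x4 block of output pixels from a
;// 4-pixel wide column of the reference picture. Six rows are loaded up
;// front; one further row is loaded after each of the first three output
;// rows, so the register roles rotate down by one row between the four
;// "four pixels" sections below.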
loop_x
    LDR     tmp1, [ref], width     ;// |a4|a3|a2|a1|
    LDR     tmp2, [ref], width     ;// |c4|c3|c2|c1|
    LDR     tmp3, [ref], width     ;// |g4|g3|g2|g1|
    LDR     tmp4, [ref], width     ;// |m4|m3|m2|m1|
    LDR     tmp5, [ref], width     ;// |r4|r3|r2|r1|
    LDR     tmp6, [ref], width     ;// |t4|t3|t2|t1|

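;// The filter works on two 16-bit lanes at a time: UXTB16/UXTAB16 pick
;// bytes 0 and 2 of each row, and the same sequence with ROR #8 handles
;// bytes 1 and 3, so each section below evaluates
;// 16 + 20*(G+M) + (A+T) - 5*(C+R) for all four columns.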
    ;// first four pixels
    UXTB16  tmpa, tmp3                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp4            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp2                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)

    UXTAB16 tmpb, tmpb, tmp5            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp3, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp2, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp5, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    MOVS    tmp1, count, LSL #31        ;// update flags (verOffset)
    LDR     tmpa, = 0xFF00FF00
    MVNEQ   tmp1, tmp3                  ;// select verOffset=0
    MVNNE   tmp1, tmp4                  ;// select verOffset=1
    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa

    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp1              ;// bilinear interpolation
    LDR     tmp1, [ref], width          ;// load next row
    EOR     res, res, tmpa              ;// correct sign

    STR     res, [mb], #16              ;// next row (mb)

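;// Quarter-pel averaging trick: the MOVS above tested bit 0 of count
;// (verOffset), and MVNEQ/MVNNE picked the one's complement of the G
;// row (verOffset=0) or of the M row (verOffset=1). With b the filtered
;// byte in res and p the selected full-pel sample, ~p = 255-p, so the
;// halving UHSUB8 followed by EOR 0x80808080 gives per byte
;//   ((b - ~p) >> 1) ^ 0x80  ==  (b + p + 1) >> 1,
;// the rounded average of the half-pel value and the nearest full-pel
;// sample. The same flags drive the MVNEQ/MVNNE pairs in the three
;// sections below, since no intervening instruction writes NZCV.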

    ;// tmp2 = |a4|a3|a2|a1|
    ;// tmp3 = |c4|c3|c2|c1|
    ;// tmp4 = |g4|g3|g2|g1|
    ;// tmp5 = |m4|m3|m2|m1|
    ;// tmp6 = |r4|r3|r2|r1|
    ;// tmp1 = |t4|t3|t2|t1|

    ;// second four pixels
    UXTB16  tmpa, tmp4                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp5            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp3                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp6            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp4, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp5, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp3, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp6, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmpa, = 0xFF00FF00
    MVNEQ   tmp2, tmp4                  ;// select verOffset=0
    MVNNE   tmp2, tmp5                  ;// select verOffset=1

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp2              ;// bilinear interpolation
    LDR     tmp2, [ref], width          ;// load next row
    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #16              ;// next row

    ;// tmp3 = |a4|a3|a2|a1|
    ;// tmp4 = |c4|c3|c2|c1|
    ;// tmp5 = |g4|g3|g2|g1|
    ;// tmp6 = |m4|m3|m2|m1|
    ;// tmp1 = |r4|r3|r2|r1|
    ;// tmp2 = |t4|t3|t2|t1|

    ;// third four pixels
    UXTB16  tmpa, tmp5                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp6            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp4                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp1            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp5, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp4, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp1, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A+T


    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmpa, = 0xFF00FF00
    MVNEQ   tmp3, tmp5                  ;// select verOffset=0
    MVNNE   tmp3, tmp6                  ;// select verOffset=1

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp3              ;// bilinear interpolation
    LDR     tmp3, [ref]                 ;// load next row
    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #16              ;// next row

    ;// tmp4 = |a4|a3|a2|a1|
    ;// tmp5 = |c4|c3|c2|c1|
    ;// tmp6 = |g4|g3|g2|g1|
    ;// tmp1 = |m4|m3|m2|m1|
    ;// tmp2 = |r4|r3|r2|r1|
    ;// tmp3 = |t4|t3|t2|t1|

    ;// fourth four pixels
    UXTB16  tmpa, tmp6                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp1            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp5                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp2            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp4            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp6, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp5, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp2, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp4, = 0xFF00FF00
    MVNEQ   tmp5, tmp6                  ;// select verOffset=0
    MVNNE   tmp5, tmp1                  ;// select verOffset=1

    AND     tmpa, tmp4, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp5              ;// bilinear interpolation

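;// End of one 4x4 output block: the x counter in the top byte of "count"
;// is decremented by 4; "ref" is stepped back up 8 rows and right by 4
;// pixels to the next source column; the final store below
;// post-decrements "mb" by 44 (= 3*16 - 4) so it ends up at the next
;// 4-pixel column of the same four output rows. When the x counter
;// borrows (carry clear) the code falls through to the row bookkeeping,
;// which advances ref and mb to the first column of the next four rows.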
    ;// decrement loop_x counter
    SUBS    count, count, #4<<24        ;// (partWidth-1) -= 4;

    ;// calculate "ref" address for next round
    SUB     ref, ref, width, LSL #3     ;// ref -= 8*width;
    ADD     ref, ref, #4                ;// next column (4 pixels)

    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #-44

    BCS     loop_x

    ADDS    count, count, #252<<16      ;// (partHeight-1) -= 4;
    ADD     ref, ref, width, LSL #2     ;// ref += 4*width
    AND     tmp1, count, #0x0000FF00    ;// partWidth-1
    MOV     tmp2, #1
    ADD     tmp2, tmp2, tmp1, LSR #8    ;// partWidth
    SUB     ref, ref, tmp2              ;// ref -= partWidth
    ADD     mb, mb, #64
    SUB     mb, mb, tmp2                ;// mb -= partWidth
    BGE     loop_y

    ADD     sp, sp, #0x1f4
    LDMFD   sp!, {r4-r11, pc}

    END