; arm11_asm — h264bsdInterpolateHorHalf (ARMv6 optimized)
; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorHalf function
;--
;-------------------------------------------------------------------------------

     21 
     22     IF :DEF: H264DEC_WINASM
     23         ;// We dont use REQUIRE8 and PRESERVE8 for winasm
     24     ELSE
     25         REQUIRE8
     26         PRESERVE8
     27     ENDIF
     28 
     29     AREA    |.text|, CODE
     30 
     31 ;// h264bsdInterpolateHorHalf register allocation
     32 
     33 ref     RN 0
     34 
     35 mb      RN 1
     36 buff    RN 1
     37 
     38 count   RN 2
     39 x0      RN 2
     40 
     41 y0      RN 3
     42 x_2_0   RN 3
     43 
     44 width   RN 4
     45 x_3_1   RN 4
     46 
     47 height  RN 5
     48 x_6_4   RN 5
     49 
     50 partW   RN 6
     51 x_7_5   RN 6
     52 
     53 partH   RN 7
     54 tmp1    RN 7
     55 
     56 tmp2    RN 8
     57 
     58 tmp3    RN 9
     59 
     60 tmp4    RN 10
     61 
     62 mult_20_01  RN 11
     63 mult_20_m5  RN 12
     64 
     65 plus16  RN 14
     66 
     67 
     68 ;// function exports and imports
     69 
     70     IMPORT  h264bsdFillBlock
     71 
     72     EXPORT  h264bsdInterpolateHorHalf
     73 
;// Horizontal filter approach
;//
;// Basic idea in horizontal filtering is to adjust coefficients
;// like below. Calculation is done with 16-bit maths.
;//
;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
;// y_0 =   20  1     20 -5        -5         1
;// y_1 =   -5        20  1      1 20        -5
;// y_2 =    1        -5        -5 20      1 20
;// y_3 =              1        20 -5     -5 20         1
     85 
     86 
     87 h264bsdInterpolateHorHalf
     88     STMFD   sp!, {r0-r11, lr}
     89     SUB     sp, sp, #0x1e4
     90 
     91     CMP     x0, #0
     92     BLT     do_fill                 ;// (x0 < 0)
     93     LDR     partW, [sp,#0x220]      ;// partWidth
     94     ADD     tmp4, x0, partW         ;// (x0+partWidth)
     95     ADD     tmp4, tmp4, #5          ;// (y0+partW+5)
     96     LDR     width, [sp,#0x218]      ;// width
     97     CMP     tmp4, width
     98     BHI     do_fill                 ;// (x0+partW)>width
     99 
    100     CMP     y0, #0
    101     BLT     do_fill                 ;// (y0 < 0)
    102     LDR     partH, [sp,#0x224]      ;// partHeight
    103     ADD     tmp2, y0, partH         ;// (y0+partHeight)
    104     LDR     height, [sp,#0x21c]     ;// height
    105     CMP     tmp2, height
    106     BLS     skip_fill               ;// no overfill needed
    107 
    108 
    109 do_fill
    110     LDR     partH, [sp,#0x224]      ;// partHeight
    111     LDR     height, [sp,#0x21c]     ;// height
    112     LDR     partW, [sp,#0x220]      ;// partWidth
    113     ADD     tmp4, partW, #5         ;// tmp4 = partW + 5;
    114     STMIB   sp, {height, tmp4}      ;// sp+4 = height, sp+8 = partWidth+5
    115     STR     partH, [sp,#0xc]        ;// sp+c = partHeight
    116     STR     tmp4, [sp,#0x10]        ;// sp+10 = partWidth+5
    117     LDR     width, [sp,#0x218]      ;// width
    118     STR     width, [sp,#0]          ;// sp+0 = width
    119     ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
    120     BL      h264bsdFillBlock
    121 
    122     MOV     x0, #0
    123     STR     x0,[sp,#0x1ec]          ;// x0 = 0
    124     STR     x0,[sp,#0x1f0]          ;// y0 = 0
    125     ADD     ref,sp,#0x28            ;// ref = p1
    126     STR     tmp4, [sp,#0x218]       ;// width = partWidth+5
    127 
    128 
    129 skip_fill
    130     LDR     x0 ,[sp,#0x1ec]         ;// x0
    131     LDR     y0 ,[sp,#0x1f0]         ;// y0
    132     LDR     width, [sp,#0x218]      ;// width
    133     MLA     tmp2, width, y0, x0     ;// y0*width+x0
    134     ADD     ref, ref, tmp2          ;// ref += y0*width+x0
    135     ADD     ref, ref, #8            ;// ref = ref+8
    136     LDR     mb, [sp, #0x1e8]        ;// mb
    137 
    138     ;// pack values to count register
    139     ;// [31:28] loop_x (partWidth-1)
    140     ;// [27:24] loop_y (partHeight-1)
    141     ;// [23:20] partWidth-1
    142     ;// [19:16] partHeight-1
    143     ;// [15:00] width
    144     MOV     count, width
    145     SUB     partW, partW, #1;
    146     SUB     partH, partH, #1;
    147     ADD     tmp2, partH, partW, LSL #4
    148     ADD     count, count, tmp2, LSL #16
    149 
    150 
    151     LDR     mult_20_01, = 0x00140001
    152     LDR     mult_20_m5, = 0x0014FFFB
    153     MOV     plus16, #16
    154     AND     tmp1, count, #0x000F0000    ;// partHeight-1
    155     AND     tmp3, count, #0x00F00000    ;// partWidth-1
    156     ADD     count, count, tmp1, LSL #8
    157 loop_y
    158     LDR     x_3_1, [ref, #-8]
    159     ADD     count, count, tmp3, LSL #8
    160     LDR     x_7_5, [ref, #-4]
    161     UXTB16  x_2_0, x_3_1
    162     UXTB16  x_3_1, x_3_1, ROR #8
    163     UXTB16  x_6_4, x_7_5
    164 
    165 loop_x
    166     UXTB16  x_7_5, x_7_5, ROR #8
    167 
    168     SMLAD   tmp1, x_2_0, mult_20_01, plus16
    169     SMLATB  tmp3, x_2_0, mult_20_01, plus16
    170     SMLATB  tmp2, x_2_0, mult_20_m5, plus16
    171     SMLATB  tmp4, x_3_1, mult_20_01, plus16
    172 
    173     SMLAD   tmp1, x_3_1, mult_20_m5, tmp1
    174     SMLATB  tmp3, x_3_1, mult_20_m5, tmp3
    175     SMLAD   tmp2, x_3_1, mult_20_01, tmp2
    176     LDR     x_3_1, [ref], #4
    177     SMLAD   tmp4, x_6_4, mult_20_m5, tmp4
    178 
    179     SMLABB  tmp1, x_6_4, mult_20_m5, tmp1
    180     SMLADX  tmp3, x_6_4, mult_20_m5, tmp3
    181     SMLADX  tmp2, x_6_4, mult_20_01, tmp2
    182     SMLADX  tmp4, x_7_5, mult_20_m5, tmp4
    183 
    184     SMLABB  tmp1, x_7_5, mult_20_01, tmp1
    185     UXTB16  x_2_0, x_3_1
    186     SMLABB  tmp2, x_7_5, mult_20_m5, tmp2
    187     SMLADX  tmp3, x_7_5, mult_20_01, tmp3
    188     SMLABB  tmp4, x_2_0, mult_20_01, tmp4
    189 
    190     MOV     tmp2, tmp2, ASR #5
    191     MOV     tmp1, tmp1, ASR #5
    192     PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    193     PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    194     USAT16  tmp2, #8, tmp2
    195     USAT16  tmp1, #8, tmp1
    196 
    197     SUBS    count, count, #4<<28
    198     ORR     tmp1, tmp1, tmp2, LSL #8
    199     STR     tmp1, [mb], #4
    200     BCC     next_y
    201 
    202     UXTB16  x_3_1, x_3_1, ROR #8
    203 
    204     SMLAD   tmp1, x_6_4, mult_20_01, plus16
    205     SMLATB  tmp3, x_6_4, mult_20_01, plus16
    206     SMLATB  tmp2, x_6_4, mult_20_m5, plus16
    207     SMLATB  tmp4, x_7_5, mult_20_01, plus16
    208 
    209     SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
    210     SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
    211     SMLAD   tmp2, x_7_5, mult_20_01, tmp2
    212     LDR     x_7_5, [ref], #4
    213     SMLAD   tmp4, x_2_0, mult_20_m5, tmp4
    214 
    215     SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
    216     SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
    217     SMLADX  tmp2, x_2_0, mult_20_01, tmp2
    218     SMLADX  tmp4, x_3_1, mult_20_m5, tmp4
    219 
    220     SMLABB  tmp1, x_3_1, mult_20_01, tmp1
    221     UXTB16  x_6_4, x_7_5
    222     SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
    223     SMLADX  tmp3, x_3_1, mult_20_01, tmp3
    224     SMLABB  tmp4, x_6_4, mult_20_01, tmp4
    225 
    226     MOV     tmp2, tmp2, ASR #5
    227     MOV     tmp1, tmp1, ASR #5
    228     PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    229     PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    230     USAT16  tmp2, #8, tmp2
    231     USAT16  tmp1, #8, tmp1
    232 
    233     SUBS    count, count, #4<<28
    234     ORR     tmp1, tmp1, tmp2, LSL #8
    235     STR     tmp1, [mb], #4
    236     BCS     loop_x
    237 
    238 next_y
    239     AND     tmp3, count, #0x00F00000    ;// partWidth-1
    240     SMLABB  ref, count, mult_20_01, ref ;// +width
    241     ADDS    mb, mb, #16                 ;// +16, Carry=0
    242     SBC     mb, mb, tmp3, LSR #20       ;// -(partWidth-1)-1
    243     SBC     ref, ref, tmp3, LSR #20     ;// -(partWidth-1)-1
    244     ADDS    count, count, #(1<<28)-(1<<24)
    245     BGE     loop_y
    246 
    247     ADD     sp,sp,#0x1f4
    248     LDMFD   sp!, {r4-r11, pc}
    249 
    250     END
    251 
    252