;// Home | History | Annotate | Download | only in arm11_asm
      1 ; Copyright (C) 2009 The Android Open Source Project
      2 ;
      3 ; Licensed under the Apache License, Version 2.0 (the "License");
      4 ; you may not use this file except in compliance with the License.
      5 ; You may obtain a copy of the License at
      6 ;
      7 ;      http://www.apache.org/licenses/LICENSE-2.0
      8 ;
      9 ; Unless required by applicable law or agreed to in writing, software
     10 ; distributed under the License is distributed on an "AS IS" BASIS,
     11 ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 ; See the License for the specific language governing permissions and
     13 ; limitations under the License.
     14 
     15 ;-------------------------------------------------------------------------------
     16 ;--
     17 ;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorQuarter function
     18 ;--
     19 ;-------------------------------------------------------------------------------
     20 
     21 
     22     IF :DEF: H264DEC_WINASM
     23         ;// We dont use REQUIRE8 and PRESERVE8 for winasm
     24     ELSE
     25         REQUIRE8                ;// callers must present an 8-byte aligned sp
     26         PRESERVE8               ;// this code preserves 8-byte sp alignment
     27     ENDIF
     28 
     29     AREA    |.text|, CODE
     30 
     31 ;// h264bsdInterpolateHorQuarter register allocation
;//
;// Several physical registers carry two alias names.  The first name of each
;// pair is used during argument checking / edge fill, the second inside the
;// filter loop, by which time the setup value is dead or has been repacked
;// into 'count'.  The x_N_M aliases hold two zero-extended samples per
;// register as 16-bit halfwords (e.g. x_2_0 = bytes 2 and 0 of the row).
     32 
     33 ref     RN 0
     34 
     35 mb      RN 1
     36 buff    RN 1
     37 
     38 count   RN 2
     39 x0      RN 2
     40 
     41 y0      RN 3
     42 x_2_0   RN 3
     43 
     44 width   RN 4
     45 x_3_1   RN 4
     46 
     47 height  RN 5
     48 x_6_4   RN 5
     49 
     50 partW   RN 6
     51 x_7_5   RN 6
     52 
     53 partH   RN 7
     54 tmp1    RN 7
     55 
     56 tmp2    RN 8
     57 
     58 tmp3    RN 9
     59 
     60 tmp4    RN 10
     61 
     62 mult_20_01  RN 11               ;// loaded with 0x00140001: coeff pair (20, 1)
     63 
     64 mult_20_m5  RN 12               ;// loaded with 0x0014FFFB: coeff pair (20, -5)
     65 
     66 plus16  RN 14                   ;// rounding constant 16, added before >> 5
     67 
     68 
     69 ;// function exports and imports
     70 
     71     IMPORT  h264bsdFillBlock
     72 
     73     EXPORT  h264bsdInterpolateHorQuarter
     74 
     75 
     76 ;// Horizontal filter approach
     77 ;//
     78 ;// Basic idea in horizontal filtering is to adjust coefficients
     79 ;// like below. Calculation is done with 16-bit maths.
     80 ;//
     81 ;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
     82 ;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
     83 ;// y_0 =   20  1     20 -5        -5         1
     84 ;// y_1 =   -5        20  1      1 20        -5
     85 ;// y_2 =    1        -5        -5 20      1 20
     86 ;// y_3 =              1        20 -5     -5 20         1
     87 
     88 
     89 h264bsdInterpolateHorQuarter
;//-----------------------------------------------------------------------------
;// Horizontal 1/4-pel luma interpolation.  Runs the 6-tap H.264 half-pel
;// filter (1,-5,20,20,-5,1) horizontally over a partWidth x partHeight block,
;// rounds/saturates each result to 8 bits, then averages it with a
;// neighbouring integer-position sample ((a + b + 1) >> 1) to produce the
;// quarter-pel value.  Output rows are stored 16 bytes apart in 'mb'.
;//
;// In:   r0 = ref  (source picture, or padded fill buffer)
;//       r1 = mb   (output macroblock buffer)
;//       r2 = x0, r3 = y0 (top-left corner of the read window)
;// Stack args (offsets valid after the 0x34-byte push + 0x1e4-byte frame):
;//       [sp,#0x218] width        [sp,#0x21c] height
;//       [sp,#0x220] partWidth    [sp,#0x224] partHeight
;//       [sp,#0x228] 5th stack arg, read as the offset of the integer sample
;//                   used for the quarter-pel average (horOffset in the C
;//                   source -- NOTE(review): name inferred from use, confirm)
;// Out:  r4-r11 restored on exit; r0-r3, r12 and flags clobbered.
;//-----------------------------------------------------------------------------
     90     STMFD   sp!, {r0-r11, lr}       ;// saved r1/r2/r3 re-read below at
     91     SUB     sp, sp, #0x1e4          ;//   [sp,#0x1e8/0x1ec/0x1f0]
     92 
;// If the read window is not fully inside the picture, branch to do_fill and
;// work from an edge-padded local copy instead.
     93     CMP     x0, #0
     94     BLT     do_fill                 ;// (x0 < 0)
     95     LDR     partW, [sp,#0x220]      ;// partWidth
     96     ADD     tmp4, x0, partW         ;// (x0+partWidth)
     97     ADD     tmp4, tmp4, #5          ;// (x0+partW+5)
     98     LDR     width, [sp,#0x218]      ;// width
     99     CMP     tmp4, width
    100     BHI     do_fill                 ;// (x0+partW)>width
    101 
    102     CMP     y0, #0
    103     BLT     do_fill                 ;// (y0 < 0)
    104     LDR     partH, [sp,#0x224]      ;// partHeight
    105     ADD     tmp2, y0, partH         ;// (y0+partHeight)
    106     LDR     height, [sp,#0x21c]     ;// height
    107     CMP     tmp2, height
    108     BLS     skip_fill               ;// no overfill needed
    109 
    110 
;// Build an edge-replicated (partW+5)-wide copy of the window into the local
;// buffer at sp+0x28, then retarget ref/x0/y0/width at that buffer.  Stack
;// words sp+0..sp+0x10 become the in-memory arguments of h264bsdFillBlock
;// (width, height, partW+5, partHeight, partW+5 -- NOTE(review): confirm the
;// order against the h264bsdFillBlock prototype); r0-r3 still hold
;// ref, buff, x0, y0 at the call.
    111 do_fill
    112     LDR     partH, [sp,#0x224]      ;// partHeight
    113     LDR     height, [sp,#0x21c]     ;// height
    114     LDR     partW, [sp,#0x220]      ;// partWidth
    115     ADD     tmp4, partW, #5         ;// tmp4 = partW + 5;
    116     STMIB   sp, {height, tmp4}      ;// sp+4 = height, sp+8 = partWidth+5
    117     STR     partH, [sp,#0xc]        ;// sp+c = partHeight
    118     STR     tmp4, [sp,#0x10]        ;// sp+10 = partWidth+5
    119     LDR     width, [sp,#0x218]      ;// width
    120     STR     width, [sp,#0]          ;// sp+0 = width
    121     ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
    122     BL      h264bsdFillBlock
    123 
    124     MOV     x0, #0
    125     STR     x0,[sp,#0x1ec]          ;// x0 = 0
    126     STR     x0,[sp,#0x1f0]          ;// y0 = 0
    127     ADD     ref,sp,#0x28            ;// ref = p1
    128     STR     tmp4, [sp,#0x218]       ;// width = partWidth+5
    129 
    130 
;// Point ref at (x0,y0) plus 8 so the loop's [ref,#-8]/[ref,#-4] loads fetch
;// the first eight window bytes while ref itself post-increments forward.
    131 skip_fill
    132     LDR     x0 ,[sp,#0x1ec]         ;// x0
    133     LDR     y0 ,[sp,#0x1f0]         ;// y0
    134     LDR     width, [sp,#0x218]      ;// width
    135     MLA     tmp2, width, y0, x0     ;// y0*width+x0
    136     ADD     ref, ref, tmp2          ;// ref += y0*width+x0
    137     ADD     ref, ref, #8            ;// ref = ref+8
    138     LDR     mb, [sp, #0x1e8]        ;// mb
    139 
    140     ;// pack values to count register
    141     ;// [31:28] loop_x (partWidth-1)
    142     ;// [27:24] loop_y (partHeight-1)
    143     ;// [23:20] partWidth-1
    144     ;// [19:16] partHeight-1
    145     ;// [15:00] width
    146     MOV     count, width
    147     SUB     partW, partW, #1;
    148     SUB     partH, partH, #1;
    149     ADD     tmp2, partH, partW, LSL #4
    150     ADD     count, count, tmp2, LSL #16
    151 
    152 
    153     LDR     mult_20_01, = 0x00140001
    154     LDR     mult_20_m5, = 0x0014FFFB
    155     MOV     plus16, #16
    156     AND     tmp1, count, #0x000F0000    ;// partHeight-1
    157     AND     tmp3, count, #0x00F00000    ;// partWidth-1
    158     ADD     count, count, tmp1, LSL #8  ;// loop_y (bits 27:24) = partHeight-1
;// Row loop: reload the x counter (bits 31:28) and prime the first eight
;// samples, unpacked into even/odd halfword pairs (see table in file header).
    159 loop_y
    160     LDR     x_3_1, [ref, #-8]
    161     ADD     count, count, tmp3, LSL #8  ;// reload loop_x (bits 31:28)
    162     LDR     x_7_5, [ref, #-4]
    163     UXTB16  x_2_0, x_3_1            ;// bytes 2,0 -> halfwords
    164     UXTB16  x_3_1, x_3_1, ROR #8    ;// bytes 3,1 -> halfwords
    165     UXTB16  x_6_4, x_7_5
    166 
;// Column loop: each full pass filters eight output pixels (two groups of
;// four).  tmp1..tmp4 accumulate y_0..y_3; every SMLxx folds in one packed
;// coefficient pair per the table in the file header.
    167 loop_x
    168     UXTB16  x_7_5, x_7_5, ROR #8
    169 
    170     SMLAD   tmp1, x_2_0, mult_20_01, plus16
    171     SMLATB  tmp3, x_2_0, mult_20_01, plus16
    172     SMLATB  tmp2, x_2_0, mult_20_m5, plus16
    173     SMLATB  tmp4, x_3_1, mult_20_01, plus16
    174 
    175     SMLAD   tmp1, x_3_1, mult_20_m5, tmp1
    176     SMLATB  tmp3, x_3_1, mult_20_m5, tmp3
    177     SMLAD   tmp2, x_3_1, mult_20_01, tmp2
    178     LDR     x_3_1, [ref], #4        ;// preload next four samples
    179     SMLAD   tmp4, x_6_4, mult_20_m5, tmp4
    180 
    181     SMLABB  tmp1, x_6_4, mult_20_m5, tmp1
    182     SMLADX  tmp3, x_6_4, mult_20_m5, tmp3
    183     SMLADX  tmp2, x_6_4, mult_20_01, tmp2
    184     SMLADX  tmp4, x_7_5, mult_20_m5, tmp4
    185 
    186     SMLABB  tmp1, x_7_5, mult_20_01, tmp1
    187     UXTB16  x_2_0, x_3_1
    188     SMLABB  tmp2, x_7_5, mult_20_m5, tmp2
    189     SMLADX  tmp3, x_7_5, mult_20_01, tmp3
    190     SMLABB  tmp4, x_2_0, mult_20_01, tmp4
    191 
;// (sum + 16) >> 5, saturate each result to [0,255], pack four bytes.
    192     MOV     tmp2, tmp2, ASR #5
    193     MOV     tmp1, tmp1, ASR #5
    194     PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    195     PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    196     LDR     tmp4, [sp, #0x228]      ;// 5th stack arg (horOffset)
    197     USAT16  tmp2, #8, tmp2
    198     USAT16  tmp1, #8, tmp1
    199     SUB     tmp4, tmp4, #10         ;// integer-sample offset vs. current ref
    200 
    201     SUBS    count, count, #4<<28    ;// loop_x -= 4 (4 pixels stored below)
    202     LDR     tmp3, [ref, tmp4]       ;// integer-pel word for the average
    203     ORR     tmp1, tmp1, tmp2, LSL #8
    204 
    205 ;// quarter pel position
;// Rounded byte-wise average with the integer samples:
;//   (a + b + 1) >> 1  ==  UHSUB8(a, ~b) ^ 0x80  per byte
    206     LDR     tmp2, = 0x80808080
    207     MVN     tmp3, tmp3
    208     UHSUB8  tmp1, tmp1, tmp3
    209     EOR     tmp1, tmp1, tmp2
    210     STR     tmp1, [mb], #4
    211 
    212     BCC     next_y                  ;// loop_x borrowed -> row complete
    213 
;// Second group of four pixels: same filter with the sample registers'
;// roles rotated by four samples.
    214     UXTB16  x_3_1, x_3_1, ROR #8
    215 
    216     SMLAD   tmp1, x_6_4, mult_20_01, plus16
    217     SMLATB  tmp3, x_6_4, mult_20_01, plus16
    218     SMLATB  tmp2, x_6_4, mult_20_m5, plus16
    219     SMLATB  tmp4, x_7_5, mult_20_01, plus16
    220 
    221     SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
    222     SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
    223     SMLAD   tmp2, x_7_5, mult_20_01, tmp2
    224     LDR     x_7_5, [ref], #4        ;// preload next four samples
    225     SMLAD   tmp4, x_2_0, mult_20_m5, tmp4
    226 
    227     SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
    228     SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
    229     SMLADX  tmp2, x_2_0, mult_20_01, tmp2
    230     SMLADX  tmp4, x_3_1, mult_20_m5, tmp4
    231 
    232     SMLABB  tmp1, x_3_1, mult_20_01, tmp1
    233     UXTB16  x_6_4, x_7_5
    234     SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
    235     SMLADX  tmp3, x_3_1, mult_20_01, tmp3
    236     SMLABB  tmp4, x_6_4, mult_20_01, tmp4
    237 
;// Round, saturate, pack and average exactly as in the first group.
    238     MOV     tmp2, tmp2, ASR #5
    239     MOV     tmp1, tmp1, ASR #5
    240     PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    241     PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    242     LDR     tmp4, [sp, #0x228]
    243     USAT16  tmp2, #8, tmp2
    244     USAT16  tmp1, #8, tmp1
    245     SUB     tmp4, tmp4, #10
    246 
    247     SUBS    count, count, #4<<28    ;// loop_x -= 4
    248     LDR     tmp3, [ref, tmp4]
    249     ORR     tmp1, tmp1, tmp2, LSL #8
    250 
    251 ;// quarter pel
    252     LDR     tmp2, = 0x80808080
    253     MVN     tmp3, tmp3
    254     UHSUB8  tmp1, tmp1, tmp3
    255     EOR     tmp1, tmp1, tmp2
    256 
    257     STR     tmp1, [mb], #4
    258     BCS     loop_x                  ;// pixels left on this row -> continue
    259 
;// Advance to the next row: ref += width (count[15:0] times the low half of
;// mult_20_01, i.e. width * 1), then rewind both pointers by partWidth
;// (ADDS clears Carry so each SBC subtracts (partWidth-1) plus 1 more).
    260 next_y
    261     AND     tmp3, count, #0x00F00000    ;// partWidth-1
    262     SMLABB  ref, count, mult_20_01, ref ;// +width
    263     ADDS    mb, mb, #16                 ;// +16, Carry=0
    264     SBC     mb, mb, tmp3, LSR #20       ;// -(partWidth-1)-1
    265     SBC     ref, ref, tmp3, LSR #20     ;// -(partWidth-1)-1
    266     ADDS    count, count, #(1<<28)-(1<<24)  ;// loop_y--, rebias loop_x
    267     BGE     loop_y                      ;// more rows to filter
    268 
;// Release the 0x1e4-byte frame plus the four saved argument words (0x10),
;// restore the callee-saved registers and return.
    269     ADD     sp,sp,#0x1f4
    270     LDMFD   sp!, {r4-r11, pc}
    271 
    272     END
    273 
    274