Home | History | Annotate | Download | only in arm11_asm
      1 ; Copyright (C) 2009 The Android Open Source Project
      2 ;
      3 ; Licensed under the Apache License, Version 2.0 (the "License");
      4 ; you may not use this file except in compliance with the License.
      5 ; You may obtain a copy of the License at
      6 ;
      7 ;      http://www.apache.org/licenses/LICENSE-2.0
      8 ;
      9 ; Unless required by applicable law or agreed to in writing, software
     10 ; distributed under the License is distributed on an "AS IS" BASIS,
     11 ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 ; See the License for the specific language governing permissions and
     13 ; limitations under the License.
     14 
     15 ;-------------------------------------------------------------------------------
     16 ;--
     17 ;-- Abstract : ARMv6 optimized version of the horizontal part of the
     18 ;--            h264bsdInterpolateMid functions
     19 ;--
     20 ;-------------------------------------------------------------------------------
     21 
     22 
     23     IF :DEF: H264DEC_WINASM
     24         ;// We don't use REQUIRE8 and PRESERVE8 for winasm
     25     ELSE
     26         REQUIRE8                ;// armasm: this file requires 8-byte stack alignment
     27         PRESERVE8               ;// armasm: this file preserves 8-byte stack alignment
     28     ENDIF
     29 
     30     AREA    |.text|, CODE       ;// everything below is assembled into the code area
     31 
     32 
     33 ;// Register allocation
        ;//
        ;// r0-r2 are read by h264bsdInterpolateMidHorPart without being
        ;// initialised there, i.e. they are its inputs. r4-r11 and r14 are
        ;// pushed on entry and popped on exit by that routine.
     34 
     35 ref     RN 0    ;// pointer to current position in reference image
     36 mb      RN 1    ;// pointer to current position in interpolated mb
     37 count   RN 2    ;// bit-packed width and count values
     38 
        ;// Packed pixel pairs: each register holds two pixels zero-extended
        ;// to 16 bits by UXTB16; x_A_B has pixel A in the high halfword and
        ;// pixel B in the low halfword (numbering as in the filter table
        ;// further down in this file).
     39 x_2_0   RN 4
     40 x_3_1   RN 5
     41 x_6_4   RN 6
     42 x_7_5   RN 7
     43 
        ;// Accumulators for four consecutive filter outputs. tmp3 is also
        ;// used as scratch for the field masked out of count.
     44 tmp1    RN 8
     45 tmp2    RN 9
     46 tmp3    RN 10
     47 tmp4    RN 11
     48 
        ;// Packed 16-bit coefficient pairs [high halfword, low halfword].
        ;// r14 is the link register; it is free for this use only because
        ;// the routine saves lr on the stack first.
     49 mult_20_01  RN 12   ;// [20,  1]
     50 mult_20_m5  RN 14   ;// [20, -5]
     51 
     52 
     53         EXPORT  h264bsdInterpolateMidHorPart
     54 
     55 ;// Horizontal filter approach
     56 ;//
     57 ;// Basic idea in horizontal filtering is to adjust coefficients
     58 ;// like below. Calculation is done with 16-bit maths.
     59 ;//
     60 ;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
     61 ;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
     62 ;// y_0 =   20  1     20 -5        -5         1
     63 ;// y_1 =   -5        20  1      1 20        -5
     64 ;// y_2 =    1        -5        -5 20      1 20
     65 ;// y_3 =              1        20 -5     -5 20         1
        ;//
        ;// Read column-wise, every output is the 6-tap sum
        ;//   y_n = p[n] - 5*p[n+1] + 20*p[n+2] + 20*p[n+3] - 5*p[n+4] + p[n+5]
        ;// where p[k] is pixel k of the current row. Pixels are unpacked
        ;// two per register with UXTB16 so that one dual multiply
        ;// (SMUAD/SMLAD/SMLADX) applies two coefficients at once.
        ;// NOTE(review): the pixel numbering assumes little-endian word
        ;// loads (p0 = byte at the lowest address) - confirm for the target.
     66 
     67 
     68 h264bsdInterpolateMidHorPart
        ;// Computes the 6-tap horizontal filter sums for a block and stores
        ;// each sum, without rounding, shifting or clamping, as one 32-bit
        ;// word through mb.
        ;//
        ;// In:   ref   - read with word loads; the first row starts at ref-8
        ;//       mb    - output pointer; one 32-bit word per output sample,
        ;//               written sequentially (rows follow each other
        ;//               without any stride)
        ;//       count - bit-packed loop counts and row stride, see below
        ;// Regs: r4-r11 and lr are saved and restored; r0, r1, r2, r12 and
        ;//       the flags are modified.
     69     STMFD   sp!, {r4-r11, lr}           ;// lr saved so r14 can serve as mult_20_m5
     70 
     71     ;// pack values to count register
     72     ;// [31:28] loop_x (partWidth-1)
     73     ;// [27:24] loop_y (partHeight-1)
     74     ;// [23:20] partWidth-1
     75     ;// [19:16] partHeight-1
     76     ;// [15:00] width
            ;//
            ;// NOTE(review): the field list above does not match what the
            ;// code below does with count:
            ;//   - bits [19:16] are copied into [31:28] at the start of every
            ;//     row and counted down by 4 per group of four outputs, so
            ;//     they act as partWidth-1;
            ;//   - the ADDS at next_y subtracts 1<<20 once per row and the
            ;//     loop ends when the result turns negative, so the row
            ;//     counter sits at bit 20 and up (rows = that value + 1);
            ;//   - bits [15:00] are added to ref once per row (width).
            ;// Confirm the intended packing against the caller. Bits [31:28]
            ;// presumably have to be zero on entry - TODO confirm.
     77 
     78 
     79     LDR     mult_20_01, = 0x00140001    ;// halfwords [20,  1]
     80     LDR     mult_20_m5, = 0x0014FFFB    ;// halfwords [20, -5]
     81     AND     tmp3, count, #0x000F0000    ;// partWidth-1
     82 loop_y
            ;// Row set-up: load pixels 0..7 and unpack all but x_7_5, which
            ;// is unpacked as the first instruction of loop_x.
     83     LDR     x_3_1, [ref, #-8]           ;// word with pixels 0..3
     84     ADD     count, count, tmp3, LSL #12 ;// [19:16] -> [31:28]: reload loop_x
     85     LDR     x_7_5, [ref, #-4]           ;// word with pixels 4..7
     86     UXTB16  x_2_0, x_3_1                ;// bytes 0 and 2 -> halfwords
     87     UXTB16  x_3_1, x_3_1, ROR #8        ;// bytes 1 and 3 -> halfwords
     88     UXTB16  x_6_4, x_7_5                ;// bytes 0 and 2 -> halfwords
     89 
     90 loop_x
            ;// Invariant on entry: x_2_0, x_3_1, x_6_4 are unpacked, x_7_5
            ;// still holds the raw word.
     91     UXTB16  x_7_5, x_7_5, ROR #8        ;// bytes 1 and 3 -> halfwords
     92 
     93     SMUAD   tmp1, x_2_0, mult_20_01     ;// y_0  = 20*p2 + 1*p0
     94     SMULTB  tmp2, x_2_0, mult_20_m5     ;// y_1  = -5*p2
     95     SMULTB  tmp3, x_2_0, mult_20_01     ;// y_2  =  1*p2
     96     SMULTB  tmp4, x_3_1, mult_20_01     ;// y_3  =  1*p3
     97 
     98     SMLAD   tmp1, x_3_1, mult_20_m5, tmp1   ;// y_0 += 20*p3 - 5*p1
     99     SMLAD   tmp2, x_3_1, mult_20_01, tmp2   ;// y_1 += 20*p3 + 1*p1
    100     SMLATB  tmp3, x_3_1, mult_20_m5, tmp3   ;// y_2 += -5*p3 (last use of old x_3_1)
    101     LDR     x_3_1, [ref], #4                ;// next word (pixels 8..11), ref += 4
    102     SMLAD   tmp4, x_6_4, mult_20_m5, tmp4   ;// y_3 += 20*p6 - 5*p4
    103 
    104     SMLABB  tmp1, x_6_4, mult_20_m5, tmp1   ;// y_0 += -5*p4
    105     SMLADX  tmp2, x_6_4, mult_20_01, tmp2   ;// y_1 +=  1*p6 + 20*p4
    106     SMLADX  tmp3, x_6_4, mult_20_m5, tmp3   ;// y_2 += -5*p6 + 20*p4
    107     SMLADX  tmp4, x_7_5, mult_20_m5, tmp4   ;// y_3 += -5*p7 + 20*p5
    108 
    109     SMLABB  tmp1, x_7_5, mult_20_01, tmp1   ;// y_0 +=  1*p5
    110     SMLABB  tmp2, x_7_5, mult_20_m5, tmp2   ;// y_1 += -5*p5
    111     UXTB16  x_2_0, x_3_1                    ;// pixels 10 and 8; old x_2_0 is dead here
    112     SMLADX  tmp3, x_7_5, mult_20_01, tmp3   ;// y_2 +=  1*p7 + 20*p5
    113     SMLABB  tmp4, x_2_0, mult_20_01, tmp4   ;// y_3 +=  1*p8
    114 
    115     SUBS    count, count, #4<<28        ;// loop_x -= 4; a borrow clears C
    116     STR     tmp1, [mb], #4              ;// stores leave the flags from SUBS intact
    117     STR     tmp2, [mb], #4
    118     STR     tmp3, [mb], #4
    119     STR     tmp4, [mb], #4
    120     BCC     next_y                      ;// borrow: this row is finished
    121 
            ;// Second half: same filter with the register pairs swapped.
            ;// x_6_4/x_7_5 now hold the oldest pixels, x_2_0/x_3_1 the newest,
            ;// and the word loaded below refills x_7_5/x_6_4 for the next pass.
    122     UXTB16  x_3_1, x_3_1, ROR #8        ;// bytes 1 and 3 of the word loaded above
    123 
    124     SMUAD   tmp1, x_6_4, mult_20_01
    125     SMULTB  tmp2, x_6_4, mult_20_m5
    126     SMULTB  tmp3, x_6_4, mult_20_01
    127     SMULTB  tmp4, x_7_5, mult_20_01
    128 
    129     SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
    130     SMLAD   tmp2, x_7_5, mult_20_01, tmp2
    131     SMLATB  tmp3, x_7_5, mult_20_m5, tmp3   ;// last use of old x_7_5
    132     LDR     x_7_5, [ref], #4                ;// next word, ref += 4; left raw for loop_x
    133     SMLAD   tmp4, x_2_0, mult_20_m5, tmp4
    134 
    135     SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
    136     SMLADX  tmp2, x_2_0, mult_20_01, tmp2
    137     SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
    138     SMLADX  tmp4, x_3_1, mult_20_m5, tmp4
    139 
    140     SMLABB  tmp1, x_3_1, mult_20_01, tmp1
    141     SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
    142     UXTB16  x_6_4, x_7_5                    ;// bytes 0 and 2 of the new word
    143     SMLADX  tmp3, x_3_1, mult_20_01, tmp3
    144     SMLABB  tmp4, x_6_4, mult_20_01, tmp4
    145 
    146     SUBS    count, count, #4<<28        ;// loop_x -= 4; a borrow clears C
    147     STR     tmp1, [mb], #4
    148     STR     tmp2, [mb], #4
    149     STR     tmp3, [mb], #4
    150     STR     tmp4, [mb], #4
    151     BCS     loop_x                      ;// no borrow: more outputs in this row
    152 
    153 next_y
            ;// C is clear on both paths into next_y (BCC taken, or BCS not
            ;// taken) and neither AND nor SMLABB changes it, so the SBC
            ;// below subtracts one extra.
    154     AND     tmp3, count, #0x000F0000    ;// partWidth-1
    155     SMLABB  ref, count, mult_20_01, ref   ;// +width
    156     SBC     ref, ref, tmp3, LSR #16   ;// -(partWidth-1)-1
            ;// +(1<<28) steps the underflowed loop_x field forward by one,
            ;// -(1<<20) counts one row down.
    157     ADDS    count, count, #(1<<28)-(1<<20)
    158     BGE     loop_y                      ;// repeat until the result goes negative
    159 
    160     LDMFD   sp!, {r4-r11, pc}           ;// restore registers; return via saved lr
    161 
    162     END
    163 
    164