Home | History | Annotate | Download | only in arm11_asm
      1 ; Copyright (C) 2009 The Android Open Source Project
      2 ;
      3 ; Licensed under the Apache License, Version 2.0 (the "License");
      4 ; you may not use this file except in compliance with the License.
      5 ; You may obtain a copy of the License at
      6 ;
      7 ;      http://www.apache.org/licenses/LICENSE-2.0
      8 ;
      9 ; Unless required by applicable law or agreed to in writing, software
     10 ; distributed under the License is distributed on an "AS IS" BASIS,
     11 ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 ; See the License for the specific language governing permissions and
     13 ; limitations under the License.
     14 
     15 ;-------------------------------------------------------------------------------
     16 ;--
     17 ;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaVer function
     18 ;--
     19 ;-------------------------------------------------------------------------------
     20 
     21 
     22     IF :DEF: H264DEC_WINASM
     23         ;// REQUIRE8 and PRESERVE8 are deliberately not emitted when building for winasm
     24     ELSE
     25         REQUIRE8
     26         PRESERVE8
     27     ENDIF
     28 
     29     AREA    |.text|, CODE           ;// everything below is assembled into the |.text| code area
     30 
     31 ;// h264bsdInterpolateChromaVer register allocation (two names on one register = reused once the first is dead)
     32 
     33 ref     RN 0    ;// source pointer argument
     34 ptrA    RN 0    ;// source read pointer advanced by the pixel loops
     35 
     36 mb      RN 1    ;// output pointer, loaded from the predPartChroma stack slot
     37 block   RN 1    ;// fill-buffer address handed to h264bsdFillBlock
     38 
     39 x0      RN 2    ;// x coordinate argument
     40 count   RN 2    ;// packed loop counters; takes over r2 after x0 has been used
     41 
     42 y0      RN 3    ;// y coordinate argument
     43 valY    RN 3    ;// packed interpolation weights; takes over r3 after y0 has been used
     44 
     45 width   RN 4    ;// source row stride used by the loops
     46 
     47 height  RN 5    ;// source plane height
     48 tmp7    RN 5    ;// loop temporary; overwrites height, which is reloaded before the Cr pass
     49 
     50 chrPW   RN 6    ;// chromaPartWidth
     51 tmp8    RN 6    ;// loop temporary; overwrites chrPW inside the pixel loops
     52 
     53 tmp1    RN 7
     54 
     55 tmp2    RN 8
     56 
     57 tmp3    RN 9
     58 
     59 tmp4    RN 10
     60 
     61 tmp5    RN 11   ;// loop temporary; overwrites chrPH inside the pixel loops
     62 chrPH   RN 11   ;// chromaPartHeight
     63 
     64 tmp6    RN 12
     65 
     66 c32     RN 14   ;// rounding constant 32; r14 is lr, which the prologue pushes
     67 yFrac   RN 14   ;// yFrac argument; only needed until valY has been packed
     68 
     69 ;// Function exports and imports
     70 
     71     IMPORT  h264bsdFillBlock
     72 
     73     EXPORT  h264bsdInterpolateChromaVer
     74 
     75 ;//  Vertical bilinear chroma interpolation: the Cb block is produced first, then the Cr block.
     76 ;//  Function arguments (offset = where each one is read from sp after the prologue)
     77 ;//  u8 *ref,                   : 0xc4
     78 ;//  u8 *predPartChroma,        : 0xc8
     79 ;//  i32 x0,                    : 0xcc
     80 ;//  i32 y0,                    : 0xd0
     81 ;//  u32 width,                 : 0xf8
     82 ;//  u32 height,                : 0xfc
     83 ;//  u32 yFrac,                 : 0x100
     84 ;//  u32 chromaPartWidth,       : 0x104
     85 ;//  u32 chromaPartHeight       : 0x108
     86 
     87 h264bsdInterpolateChromaVer
     88     STMFD   sp!, {r0-r11,lr}        ;// r0-r3 are pushed as well so the arguments can be reloaded later
     89     SUB     sp, sp, #0xc4           ;// locals: five words at sp for the fill calls, fill buffer from sp+0x1c
     90 
     91     LDR     chrPW, [sp, #0x104]     ;// chromaPartWidth
     92     LDR     width, [sp, #0xf8]      ;// width
     93     CMP     x0, #0
     94     BLT     do_fill                 ;// x0 < 0
     95 
     96     ADD     tmp1, x0, chrPW         ;// tmp1 = x0+ chromaPartWidth
     97     CMP     tmp1, width             ;// x0+chromaPartWidth > width (unsigned)
     98     BHI     do_fill
     99 
    100     CMP     y0, #0
    101     BLT     do_fill                 ;// y0 < 0
    102     LDR     chrPH, [sp, #0x108]     ;// chromaPartHeight
    103     LDR     height, [sp, #0xfc]     ;// height
    104     ADD     tmp1, y0, chrPH         ;// tmp1 = y0 + chromaPartHeight
    105     ADD     tmp1, tmp1, #1          ;// tmp1 = y0 + chromaPartHeight + 1 (one extra source row is read)
    106     CMP     tmp1, height
    107     BLS     skip_fill               ;// window lies inside the plane: read straight from ref
    108 
    109 do_fill                             ;// window not fully inside width x height: go through the stack buffer
    110     LDR     chrPH, [sp, #0x108]     ;// chromaPartHeight (not yet loaded when an x0/y0 test branched here)
    111     LDR     height, [sp, #0xfc]     ;// height
    112     ADD     tmp1, chrPH, #1         ;// tmp1 = chromaPartHeight+1
    113     MOV     tmp2, chrPW             ;// tmp2 = chromaPartWidth
    114     STMIA   sp,{width,height,chrPW,tmp1,tmp2}   ;// words at sp: width, height, chrPW, chrPH+1, chrPW
    115     ADD     block, sp, #0x1c        ;// block
    116     BL      h264bsdFillBlock        ;// first call: r0 = ref, r1 = block, r2 = x0, r3 = y0
    117 
    118     LDR     x0, [sp, #0xcc]         ;// reload x0 from its saved slot
    119     LDR     y0, [sp, #0xd0]         ;// reload y0 from its saved slot
    120     LDR     ref, [sp, #0xc4]        ;// ref
    121     STMIA   sp,{width,height,chrPW,tmp1,tmp2}   ;// write the same five words again for the second call
    122     ADD     block, sp, #0x1c        ;// block
    123     MLA     ref, height, width, ref ;// ref += width * height;
    124     MLA     block, chrPW, tmp1, block;// block + (chromaPW)*(chromaPH+1)
    125     BL      h264bsdFillBlock        ;// second call: source one plane further, destination after the first block
    126 
    127     MOV     x0, #0                  ;// x0 = 0
    128     MOV     y0, #0                  ;// y0 = 0
    129     STR     x0, [sp, #0xcc]         ;// saved x0 = 0, reread by the Cr pass
    130     STR     y0, [sp, #0xd0]         ;// saved y0 = 0, reread by the Cr pass
    131     ADD     ref, sp, #0x1c          ;// ref = block
    132     STR     ref, [sp, #0xc4]        ;// ref
    133 
    134     STR     tmp1, [sp, #0xfc]       ;// height = chromaPartHeight+1
    135     STR     chrPW, [sp, #0xf8]      ;// width = chromaPartWidth
    136     MOV     width, chrPW            ;// row stride of the filled buffer
    137 
    138 skip_fill
    139     MLA     tmp3, y0, width, x0     ;// tmp3 = y0*width+x0
    140     LDR     yFrac, [sp, #0x100]     ;// yFrac
    141     ADD     ptrA, ref, tmp3         ;// ptrA = ref + y0*width+x0
    142     RSB     valY, yFrac, #8         ;// valY = 8-yFrac
    143 
    144     LDR     mb, [sp, #0xc8]         ;// predPartChroma
    145 
    146     ;// count shares r2 with x0 (last used by the MLA above), so it is built from scratch
    147     ;// pack values to count register
    148     ;// [31:28] loop_x (chromaPartWidth-1), added at the top of every row pass
    149     ;// [27:24] loop_y (chromaPartHeight-1)
    150     ;// [23:20] chromaPartWidth-1
    151     ;// [19:16] chromaPartHeight-1
    152     ;// [15:00] nothing
    153 
    154     SUB     tmp2, chrPH, #1             ;// chromaPartHeight-1
    155     SUB     tmp1, chrPW, #1             ;// chromaPartWidth-1
    156     MOV     count, tmp2, LSL #16        ;// chromaPartHeight-1; MOV discards the x0 left in r2
    157     ADD     count, count, tmp2, LSL #24 ;// loop_y
    158     ADD     count, count, tmp1, LSL #20 ;// chromaPartWidth-1
    159     AND     tmp2, count, #0x00F00000    ;// loop_x
    160     PKHBT   valY, valY, yFrac, LSL #16  ;// |yFrac|valY |
    161     MOV     valY, valY, LSL #3          ;// multiply by 8 in advance
    162     MOV     c32, #32                    ;// rounding term for the >>6 below; overwrites yFrac (same register)
    163 
    164 
    165     ;///////////////////////////////////////////////////////////////////////////
    166     ;// Cb
    167     ;///////////////////////////////////////////////////////////////////////////
    168 
    169     ;// 2x2 pels per iteration
    170     ;// bilinear vertical interpolation
    171     ;// NOTE(review): the counter arithmetic appears to assume even chromaPartWidth/Height - confirm with callers
    172 loop1_y
    173     ADD     count, count, tmp2, LSL #8  ;// loop_x = chromaPartWidth-1 ([23:20] copied up to [31:28])
    174 loop1_x
    175     ;// Process 2x2 block
    176     LDRB    tmp2, [ptrA,width]          ;// 2 row, 1 col
    177     LDRB    tmp3, [ptrA,width, LSL #1]  ;// 3 row, 1 col
    178     LDRB    tmp1, [ptrA],#1             ;// 1 row, 1 col
    179 
    180     LDRB    tmp5, [ptrA,width]          ;// 2 row, 2 col
    181     LDRB    tmp6, [ptrA,width, LSL #1]  ;// 3 row, 2 col
    182     LDRB    tmp4, [ptrA],#1             ;// 1 row, 2 col
    183 
    184     PKHBT   tmp1, tmp1, tmp2, LSL #16   ;// |B|A|
    185     PKHBT   tmp2, tmp2, tmp3, LSL #16   ;// |C|B|
    186     PKHBT   tmp4, tmp4, tmp5, LSL #16   ;// |B|A|
    187 
    188     SMLAD   tmp7, tmp2, valY, c32       ;// col 1, out row 2: 8*((8-yFrac)*B + yFrac*C) + 32
    189     PKHBT   tmp5, tmp5, tmp6, LSL #16   ;// |C|B|
    190     SMLAD   tmp2, tmp1, valY, c32       ;// col 1, out row 1: 8*((8-yFrac)*A + yFrac*B) + 32
    191     SMLAD   tmp8, tmp5, valY, c32       ;// col 2, out row 2
    192     SMLAD   tmp5, tmp4, valY, c32       ;// col 2, out row 1
    193 
    194     MOV     tmp7, tmp7, LSR #6          ;// scale down
    195     STRB    tmp7, [mb,#8]               ;// store row 2 col 1
    196     MOV     tmp2, tmp2, LSR #6          ;// scale down
    197     STRB    tmp2, [mb],#1               ;// store row 1 col 1
    198 
    199     MOV     tmp8, tmp8, LSR #6          ;// scale down
    200     STRB    tmp8, [mb,#8]               ;// store row 2 col 2
    201     MOV     tmp5, tmp5, LSR #6          ;// scale down
    202     STRB    tmp5, [mb],#1               ;// store row 1 col 2
    203 
    204 
    205     SUBS    count, count, #2<<28        ;// loop_x -= 2 (two columns done)
    206     BCS     loop1_x                     ;// repeat until the subtraction borrows
    207 
    208     AND     tmp2, count, #0x00F00000    ;// tmp2 = (chromaPartWidth-1) << 20
    209 
    210     ADDS    mb, mb, #16                 ;// two output rows down (rows are 8 bytes apart); C expected clear
    211     SBC     mb, mb, tmp2, LSR #20       ;// with C clear: mb -= (chromaPartWidth-1)+1, back to column 0
    212     ADD     ptrA, ptrA, width, LSL #1   ;// two source rows down; flags left untouched
    213     SBC     ptrA, ptrA, tmp2, LSR #20   ;// ptrA -= chromaPartWidth, back to column 0
    214 
    215     ADDS    count, count, #0xE << 24    ;// loop_y -= 2 (mod 16); carries into [31:28] while loop_y >= 2
    216     BGE     loop1_y                     ;// loop while the result is non-negative
    217 
    218     ;///////////////////////////////////////////////////////////////////////////
    219     ;// Cr
    220     ;///////////////////////////////////////////////////////////////////////////
    221     LDR     height, [sp,#0xfc]          ;// height
    222     LDR     ref, [sp, #0xc4]            ;// ref
    223     LDR     tmp1, [sp, #0xd0]           ;// y0
    224     LDR     tmp2, [sp, #0xcc]           ;// x0
    225     LDR     mb, [sp, #0xc8]             ;// predPartChroma
    226 
    227     ADD     tmp1, height, tmp1          ;// tmp1 = height + y0
    228     MLA     tmp3, tmp1, width, tmp2     ;// tmp3 = (height+y0)*width + x0
    229     ADD     ptrA, ref, tmp3             ;// ptrA = ref + (height+y0)*width + x0
    230     ADD     mb, mb, #64                 ;// Cr output starts 64 bytes after predPartChroma
    231 
    232     AND     count, count, #0x00FFFFFF   ;// clear loop_x and loop_y
    233     AND     tmp1, count, #0x000F0000    ;// chromaPartHeight-1 kept in [19:16]
    234     ADD     count, count, tmp1, LSL #8  ;// loop_y = chromaPartHeight-1
    235     AND     tmp2, count, #0x00F00000    ;// (chromaPartWidth-1) << 20, added as loop_x below
    236 
    237     ;// 2x2 pels per iteration
    238     ;// bilinear vertical interpolation
    239 loop2_y
    240     ADD     count, count, tmp2, LSL #8  ;// loop_x = chromaPartWidth-1 ([23:20] copied up to [31:28])
    241 loop2_x
    242     ;// Process 2x2 block
    243     LDRB    tmp2, [ptrA,width]          ;// 2 row, 1 col
    244     LDRB    tmp3, [ptrA,width, LSL #1]  ;// 3 row, 1 col
    245     LDRB    tmp1, [ptrA],#1             ;// 1 row, 1 col
    246 
    247     LDRB    tmp5, [ptrA,width]          ;// 2 row, 2 col
    248     LDRB    tmp6, [ptrA,width, LSL #1]  ;// 3 row, 2 col
    249     LDRB    tmp4, [ptrA],#1             ;// 1 row, 2 col
    250 
    251     PKHBT   tmp1, tmp1, tmp2, LSL #16   ;// |B|A|
    252     PKHBT   tmp2, tmp2, tmp3, LSL #16   ;// |C|B|
    253     PKHBT   tmp4, tmp4, tmp5, LSL #16   ;// |B|A|
    254 
    255     SMLAD   tmp7, tmp2, valY, c32       ;// col 1, out row 2: 8*((8-yFrac)*B + yFrac*C) + 32
    256     PKHBT   tmp5, tmp5, tmp6, LSL #16   ;// |C|B|
    257     SMLAD   tmp2, tmp1, valY, c32       ;// col 1, out row 1: 8*((8-yFrac)*A + yFrac*B) + 32
    258     SMLAD   tmp8, tmp5, valY, c32       ;// col 2, out row 2
    259     SMLAD   tmp5, tmp4, valY, c32       ;// col 2, out row 1
    260 
    261     MOV     tmp7, tmp7, LSR #6          ;// scale down
    262     STRB    tmp7, [mb,#8]               ;// store row 2 col 1
    263     MOV     tmp2, tmp2, LSR #6          ;// scale down
    264     STRB    tmp2, [mb],#1               ;// store row 1 col 1
    265 
    266     MOV     tmp8, tmp8, LSR #6          ;// scale down
    267     STRB    tmp8, [mb,#8]               ;// store row 2 col 2
    268     MOV     tmp5, tmp5, LSR #6          ;// scale down
    269     STRB    tmp5, [mb],#1               ;// store row 1 col 2
    270 
    271 
    272     SUBS    count, count, #2<<28        ;// loop_x -= 2 (two columns done)
    273     BCS     loop2_x                     ;// repeat until the subtraction borrows
    274 
    275     AND     tmp2, count, #0x00F00000    ;// tmp2 = (chromaPartWidth-1) << 20
    276 
    277     ADDS    mb, mb, #16                 ;// two output rows down (rows are 8 bytes apart); C expected clear
    278     SBC     mb, mb, tmp2, LSR #20       ;// with C clear: mb -= (chromaPartWidth-1)+1, back to column 0
    279     ADD     ptrA, ptrA, width, LSL #1   ;// two source rows down; flags left untouched
    280     SBC     ptrA, ptrA, tmp2, LSR #20   ;// ptrA -= chromaPartWidth, back to column 0
    281 
    282     ADDS    count, count, #0xE << 24    ;// loop_y -= 2 (mod 16); carries into [31:28] while loop_y >= 2
    283     BGE     loop2_y                     ;// loop while the result is non-negative
    284 
    285     ADD     sp,sp,#0xd4                 ;// drop the 0xc4 bytes of locals plus the 16 bytes of saved r0-r3
    286     LDMFD   sp!, {r4-r11,pc}            ;// restore r4-r11 and return through the saved lr
    287 
    288     END
    289