; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHor function
;--
;-------------------------------------------------------------------------------


    IF  :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE


;// h264bsdInterpolateChromaHor register allocation

ref     RN 0
ptrA    RN 0

mb      RN 1
block   RN 1

x0      RN 2
count   RN 2

y0      RN 3
valX    RN 3

width   RN 4

height  RN 5
tmp7    RN 5

chrPW   RN 6
tmp8    RN 6

tmp1    RN 7
chrPH   RN 7

tmp2    RN 8

tmp3    RN 9

tmp4    RN 10

tmp5    RN 11

tmp6    RN 12

c32     RN 14
xFrac   RN 14

;// Function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateChromaHor

;//  Function arguments
;//
;//  u8 *ref,                   : 0xc4
;//  u8 *predPartChroma,        : 0xc8
;//  i32 x0,                    : 0xcc
;//  i32 y0,                    : 0xd0
;//  u32 width,                 : 0xf8
;//  u32 height,                : 0xfc
;//  u32 xFrac,                 : 0x100
;//  u32 chromaPartWidth,       : 0x104
;//  u32 chromaPartHeight       : 0x108
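;//
;//  The first four arguments are read back from the copies of r0-r3 pushed in
;//  the prologue; width and the later arguments are the caller's stack
;//  arguments, which sit above the thirteen saved registers. The offsets above
;//  follow from the 0xc4-byte local area plus 13*4 bytes of saved registers.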

h264bsdInterpolateChromaHor
    STMFD   sp!, {r0-r11,lr}
    SUB     sp, sp, #0xc4

    LDR     chrPW, [sp, #0x104]     ;// chromaPartWidth
    LDR     width, [sp, #0xf8]      ;// width
    CMP     x0, #0
    BLT     do_fill

    ADD     tmp6, x0, chrPW         ;// tmp6 = x0 + chromaPartWidth
    ADD     tmp6, tmp6, #1          ;// tmp6 = x0 + chromaPartWidth + 1
    CMP     tmp6, width             ;// x0+chromaPartWidth+1 > width
    BHI     do_fill

    CMP     y0, #0
    BLT     do_fill
    LDR     chrPH, [sp, #0x108]     ;// chromaPartHeight
    LDR     height, [sp, #0xfc]     ;// height
    ADD     tmp6, y0, chrPH         ;// tmp6 = y0 + chromaPartHeight
    CMP     tmp6, height
    BLS     skip_fill

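    ;// The requested area extends outside the reference picture. Build an
    ;// edge-padded copy on the stack with h264bsdFillBlock (presumably one
    ;// (chromaPartWidth+1) x chromaPartHeight block for Cb, then one for Cr),
    ;// so that the horizontal taps at x and x+1 never read outside the picture.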
do_fill
    LDR     chrPH, [sp, #0x108]     ;// chromaPartHeight
    LDR     height, [sp, #0xfc]     ;// height
    ADD     tmp8, chrPW, #1         ;// tmp8 = chromaPartWidth+1
    MOV     tmp2, tmp8              ;// tmp2 = chromaPartWidth+1
    STMIA   sp,{width,height,tmp8,chrPH,tmp2}
    ADD     block, sp, #0x1c        ;// block
    BL      h264bsdFillBlock

    LDR     x0, [sp, #0xcc]
    LDR     y0, [sp, #0xd0]
    LDR     ref, [sp, #0xc4]        ;// ref
    STMIA   sp,{width,height,tmp8,chrPH,tmp2}
    ADD     block, sp, #0x1c        ;// block
    MLA     ref, height, width, ref ;// ref += width * height;
    MLA     block, chrPH, tmp8, block ;// block + (chromaPH)*(chromaPW+1)
    BL      h264bsdFillBlock

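    ;// Redirect the interpolation source to the padded copy: ref, x0 and y0 now
    ;// describe the block at sp+0x1c, and width/height take the padded block
    ;// dimensions. chrPW shares a register with tmp8, so the SUB below restores
    ;// it to the original chromaPartWidth.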
    MOV     x0, #0                  ;// x0 = 0
    MOV     y0, #0                  ;// y0 = 0
    STR     x0, [sp, #0xcc]
    STR     y0, [sp, #0xd0]
    ADD     ref, sp, #0x1c          ;// ref = block
    STR     ref, [sp, #0xc4]        ;// ref

    STR     chrPH, [sp, #0xfc]      ;// height
    STR     tmp8, [sp, #0xf8]       ;// width
    MOV     width, tmp8
    SUB     chrPW, chrPW, #1

skip_fill
    MLA     tmp3, y0, width, x0     ;// tmp3 = y0*width+x0
    LDR     xFrac, [sp, #0x100]     ;// xFrac
    ADD     ptrA, ref, tmp3         ;// ptrA = ref + y0*width+x0
    RSB     valX, xFrac, #8         ;// valX = 8-xFrac

    LDR     mb, [sp, #0xc8]         ;// predPartChroma


    ;// pack values to count register
    ;// [31:28] loop_x (chromaPartWidth-1)
    ;// [27:24] loop_y (chromaPartHeight-1)
    ;// [23:20] chromaPartWidth-1
    ;// [19:16] chromaPartHeight-1
    ;// [15:00] nothing

    SUB     tmp2, chrPH, #1             ;// chromaPartHeight-1
    SUB     tmp1, chrPW, #1             ;// chromaPartWidth-1
    ADD     count, count, tmp2, LSL #16 ;// chromaPartHeight-1
    ADD     count, count, tmp2, LSL #24 ;// loop_y
    ADD     count, count, tmp1, LSL #20 ;// chromaPartWidth-1
    AND     tmp2, count, #0x00F00000    ;// loop_x
    PKHBT   valX, valX, xFrac, LSL #16  ;// |xFrac|valX |
    MOV     valX, valX, LSL #3          ;// multiply by 8 in advance
    MOV     c32, #32
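
    ;// Per-pixel arithmetic is the standard H.264 chroma interpolation with
    ;// yFrac = 0:  out = ((8 - xFrac)*A + xFrac*B + 4) >> 3, where A and B are
    ;// horizontally adjacent samples. valX packs both weights pre-multiplied by
    ;// 8, so each SMLAD below computes 8*(8-xFrac)*A + 8*xFrac*B + 32 and the
    ;// LSR #6 produces the identical result.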


    ;///////////////////////////////////////////////////////////////////////////
    ;// Cb
    ;///////////////////////////////////////////////////////////////////////////

    ;// 2x2 pels per iteration
    ;// bilinear horizontal interpolation

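    ;// Each pass through loop1_x produces a 2x2 output block: tmp6/tmp8 are the
    ;// two pixels of the current row (stored with post-increment) and tmp5/tmp7
    ;// the pixels directly below (stored at offset 8, the 8x8 block stride).
    ;// Loads for the next column pair are interleaved with the multiplies.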
loop1_y
    ADD     count, count, tmp2, LSL #8
    LDRB    tmp1, [ptrA, width]
    LDRB    tmp2, [ptrA], #1

loop1_x
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp4, [ptrA], #1

    PKHBT   tmp5, tmp1, tmp3, LSL #16
    PKHBT   tmp6, tmp2, tmp4, LSL #16

    LDRB    tmp1, [ptrA, width]
    LDRB    tmp2, [ptrA], #1

    SMLAD   tmp5, tmp5, valX, c32       ;// multiply
    SMLAD   tmp6, tmp6, valX, c32       ;// multiply

    PKHBT   tmp7, tmp3, tmp1, LSL #16
    PKHBT   tmp8, tmp4, tmp2, LSL #16

    SMLAD   tmp7, tmp7, valX, c32       ;// multiply
    SMLAD   tmp8, tmp8, valX, c32       ;// multiply

    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb,#8]               ;// store row 2 col 1

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb],#1               ;// store row 1 col 1

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 2

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb],#1               ;// store row 1 col 2

    SUBS    count, count, #2<<28
    BCS     loop1_x

    AND     tmp2, count, #0x00F00000

    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20
    SUB     ptrA, ptrA, #1
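    ;// The ADDS above leaves carry clear (the address addition does not wrap),
    ;// so each SBC subtracts (chromaPartWidth-1) + 1 = chromaPartWidth. Net
    ;// effect: mb moves two output rows ahead (+16) and ptrA two input rows
    ;// ahead (+2*width), the extra SUB covering the pre-read before loop1_x.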

    ADDS    count, count, #0xE << 24
    BGE     loop1_y

    ;///////////////////////////////////////////////////////////////////////////
    ;// Cr
    ;///////////////////////////////////////////////////////////////////////////
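    ;// The chroma reference stores the whole Cb plane followed by the Cr plane,
    ;// and predPartChroma holds the 8x8 Cb block followed by the 8x8 Cr block,
    ;// so ptrA is rebuilt height rows further down and mb is advanced by 64.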
    LDR     height, [sp,#0xfc]          ;// height
    LDR     ref, [sp, #0xc4]            ;// ref
    LDR     tmp1, [sp, #0xd0]           ;// y0
    LDR     tmp2, [sp, #0xcc]           ;// x0
    LDR     mb, [sp, #0xc8]             ;// predPartChroma

    ADD     tmp1, height, tmp1
    MLA     tmp3, tmp1, width, tmp2
    ADD     ptrA, ref, tmp3
    ADD     mb, mb, #64

    AND     count, count, #0x00FFFFFF
    AND     tmp1, count, #0x000F0000
    ADD     count, count, tmp1, LSL #8
    AND     tmp2, count, #0x00F00000
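    ;// Re-arm the loop counters for Cr: loop_y (bits [27:24]) is reloaded from
    ;// the stored chromaPartHeight-1 field, and tmp2 again carries
    ;// chromaPartWidth-1, ready to refill loop_x at the top of loop2_y.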

    ;// 2x2 pels per iteration
    ;// bilinear horizontal interpolation
loop2_y
    ADD     count, count, tmp2, LSL #8
    LDRB    tmp1, [ptrA, width]
    LDRB    tmp2, [ptrA], #1

loop2_x
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp4, [ptrA], #1

    PKHBT   tmp5, tmp1, tmp3, LSL #16
    PKHBT   tmp6, tmp2, tmp4, LSL #16

    LDRB    tmp1, [ptrA, width]
    LDRB    tmp2, [ptrA], #1

    SMLAD   tmp5, tmp5, valX, c32       ;// multiply
    SMLAD   tmp6, tmp6, valX, c32       ;// multiply

    PKHBT   tmp7, tmp3, tmp1, LSL #16
    PKHBT   tmp8, tmp4, tmp2, LSL #16

    SMLAD   tmp7, tmp7, valX, c32       ;// multiply
    SMLAD   tmp8, tmp8, valX, c32       ;// multiply

    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb,#8]               ;// store row 2 col 1

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb],#1               ;// store row 1 col 1

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 2

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb],#1               ;// store row 1 col 2

    SUBS    count, count, #2<<28
    BCS     loop2_x

    AND     tmp2, count, #0x00F00000

    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20
    SUB     ptrA, ptrA, #1

    ADDS    count, count, #0xE << 24
    BGE     loop2_y

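    ;// Pop the 0xc4 bytes of locals plus the four saved argument registers
    ;// (0xc4 + 16 = 0xd4), then restore the callee-saved registers and return.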
    ADD     sp, sp, #0xd4
    LDMFD   sp!, {r4-r11,pc}

    END