Home | History | Annotate | Download | only in arm11_asm
      1 ; Copyright (C) 2009 The Android Open Source Project
      2 ;
      3 ; Licensed under the Apache License, Version 2.0 (the "License");
      4 ; you may not use this file except in compliance with the License.
      5 ; You may obtain a copy of the License at
      6 ;
      7 ;      http://www.apache.org/licenses/LICENSE-2.0
      8 ;
      9 ; Unless required by applicable law or agreed to in writing, software
     10 ; distributed under the License is distributed on an "AS IS" BASIS,
     11 ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 ; See the License for the specific language governing permissions and
     13 ; limitations under the License.
     14 
     15 ;-------------------------------------------------------------------------------
     16 ;--
     17 ;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHorVer
     18 ;--            function
     19 ;--
     20 ;-------------------------------------------------------------------------------
     21 
     22 
     23     IF  :DEF: H264DEC_WINASM        ;// branch taken only when H264DEC_WINASM is defined
     24         ;// We don't use REQUIRE8 and PRESERVE8 for winasm
     25     ELSE
     26         REQUIRE8                    ;// declare: this code requires 8-byte stack alignment
     27         PRESERVE8                   ;// declare: this code preserves 8-byte stack alignment
     28     ENDIF
     29 
     30     AREA    |.text|, CODE           ;// everything below goes into the code area named .text
     31 
     32 
     33 ;// h264bsdInterpolateChromaHorVer register allocation
        ;//
        ;// RN binds a symbolic name to a register number. Names that share a
        ;// number are the SAME physical register: writing one overwrites the
        ;// other, so an alias is only meaningful after the code has written it
        ;// under that name. r14 is the link register; it is saved on entry and
        ;// then used as a general-purpose register.
     34 
     35 ref     RN 0                        ;// r0: ref argument (source pointer)
     36 ptrA    RN 0                        ;// r0: current source read address in the pel loops
     37 
     38 mb      RN 1                        ;// r1: output write pointer (loaded from predPartChroma)
     39 block   RN 1                        ;// r1: buffer pointer passed to h264bsdFillBlock
     40 
     41 x0      RN 2                        ;// r2: x0 argument
     42 count   RN 2                        ;// r2: packed loop counters and partition dimensions
     43 
     44 y0      RN 3                        ;// r3: y0 argument
     45 valY    RN 3                        ;// r3: 8-yFrac, later packed as |yFrac|8-yFrac|
     46 
     47 width   RN 4                        ;// r4: byte step between source rows
     48 
     49 tmp4    RN 5                        ;// r5: scratch inside the pel loops
     50 height  RN 5                        ;// r5: height argument (reloaded from the stack when needed)
     51 
     52 tmp1    RN 6                        ;// r6: scratch
     53 
     54 tmp2    RN 7                        ;// r7: scratch
     55 
     56 tmp3    RN 8                        ;// r8: scratch
     57 
     58 valX    RN 9                        ;// r9: 8-xFrac
     59 
     60 tmp5    RN 10                       ;// r10: scratch inside the pel loops
     61 chrPW   RN 10                       ;// r10: chromaPartWidth (before the loops)
     62 
     63 tmp6    RN 11                       ;// r11: scratch inside the pel loops
     64 chrPH   RN 11                       ;// r11: chromaPartHeight (before the loops)
     65 
     66 xFrac   RN 12                       ;// r12: xFrac
     67 
     68 c32     RN 14                       ;// r14: constant 32 added before the >>6 scale-down
     69 yFrac   RN 14                       ;// r14: yFrac (until c32 is loaded)
     70 
     71 ;// function exports and imports
     72 
     73     IMPORT  h264bsdFillBlock
     74 
     75     EXPORT  h264bsdInterpolateChromaHorVer
     76 
     77 ;//  Function arguments (offsets are sp-relative and valid after the prologue's STMFD + SUB sp)
     78 ;//
     79 ;//  u8 *ref,                   : 0xc4
     80 ;//  u8 *predPartChroma,        : 0xc8
     81 ;//  i32 x0,                    : 0xcc
     82 ;//  i32 y0,                    : 0xd0
     83 ;//  u32 width,                 : 0xf8
     84 ;//  u32 height,                : 0xfc
     85 ;//  u32 xFrac,                 : 0x100
     86 ;//  u32 yFrac,                 : 0x104
     87 ;//  u32 chromaPartWidth,       : 0x108
     88 ;//  u32 chromaPartHeight       : 0x10c
     89 
     90 h264bsdInterpolateChromaHorVer
            ;// ------------------------------------------------------------------
            ;// Bilinear (horizontal + vertical) chroma interpolation of one
            ;// partition, Cb first and then Cr. For a 2x2 source neighbourhood
            ;//     A B
            ;//     C D
            ;// each output pel is
            ;//     ((8-xFrac)*((8-yFrac)*A + yFrac*C)
            ;//       +  xFrac *((8-yFrac)*B + yFrac*D) + 32) >> 6
            ;//
            ;// In:    r0-r3 plus six stack words, see "Function arguments".
            ;// Out:   pels are stored through predPartChroma with a row step of
            ;//        8 bytes; the Cr output starts at predPartChroma + 64.
            ;//        r0 is not set to a defined return value.
            ;// Calls: h264bsdFillBlock, twice (Cb, Cr), when the
            ;//        (chromaPartWidth+1) x (chromaPartHeight+1) source window
            ;//        at (x0,y0) is not completely inside width x height. The
            ;//        interpolation then reads from the filled scratch block.
            ;// Frame: sp+0x00..0x13  outgoing stack arguments for the call
            ;//        sp+0x1c..0xc3  scratch block (Cb window, then Cr window)
            ;//        sp+0xc4..0xd3  saved r0-r3, followed by saved r4-r11, lr
            ;// Loops: 2x2 pels per iteration driven by 4-bit counters, so
            ;//        chromaPartWidth/Height are expected to be even.
            ;//        NOTE(review): presumably also <= 8 given the 8-byte output
            ;//        row step -- confirm against callers.
            ;// ------------------------------------------------------------------
     91     STMFD   sp!, {r0-r11,lr}        ;// save args r0-r3, r4-r11 and lr (13 words)
     92     SUB     sp, sp, #0xc4           ;// locals: outgoing args + scratch block
     93 
     94     LDR     chrPW, [sp, #0x108]     ;// chromaPartWidth
     95     LDR     xFrac, [sp, #0x100]     ;// xFrac (loaded again at skip_fill)
     96     LDR     width, [sp, #0xf8]      ;// width
     97     CMP     x0, #0
     98     BLT     do_fill                 ;// x0 < 0 (signed): window starts outside
     99 
    100     ADD     tmp1, x0, chrPW         ;// tmp1 = x0+ chromaPartWidth
    101     ADD     tmp1, tmp1, #1          ;// tmp1 = x0+ chromaPartWidth+1
    102     CMP     tmp1, width             ;// x0+chromaPartWidth+1 > width
    103     BHI     do_fill                 ;// unsigned compare: window ends outside
    104 
    105     CMP     y0, #0
    106     BLT     do_fill                 ;// y0 < 0 (signed): window starts outside
    107     LDR     chrPH, [sp, #0x10c]     ;// chromaPartHeight
    108     LDR     height, [sp, #0xfc]     ;// height
    109     ADD     tmp1, y0, chrPH         ;// tmp1 = y0 + chromaPartHeight
    110     ADD     tmp1, tmp1, #1          ;// tmp1 = y0 + chromaPartHeight + 1
    111     CMP     tmp1, height
    112     BLS     skip_fill               ;// window fully inside: read ref directly
    113 
    114 do_fill
            ;// Reached from several branches, some taken before chrPH/height
            ;// were loaded, so both are (re)loaded here.
    115     LDR     chrPH, [sp, #0x10c]     ;// chromaPartHeight
    116     LDR     height, [sp, #0xfc]     ;// height
    117     ADD     tmp3, chrPW, #1         ;// tmp3 = chromaPartWidth+1
    118     ADD     tmp1, chrPW, #1         ;// tmp1 = chromaPartWidth+1
    119     ADD     tmp2, chrPH, #1         ;// tmp2 = chromaPartHeight+1
    120     STMIA   sp,{width,height,tmp1,tmp2,tmp3} ;// stack args 5..9 at sp+0..0x10
    121     ADD     block, sp, #0x1c        ;// block
    122     BL      h264bsdFillBlock        ;// Cb window: r0=ref, r1=block, r2=x0, r3=y0
    123 
            ;// r0-r3 are reloaded from their saved copies after the call.
    124     LDR     x0, [sp, #0xcc]
    125     LDR     y0, [sp, #0xd0]
    126     LDR     ref, [sp, #0xc4]        ;// ref
    127     STMIA   sp,{width,height,tmp1,tmp2,tmp3} ;// write the stack args again
    128     ADD     block, sp, #0x1c        ;// block
    129     MLA     ref, height, width, ref ;// ref += width * height;
    130     MLA     block, tmp2, tmp1, block;// block + (chromaPW+1)*(chromaPH+1)
    131     BL      h264bsdFillBlock        ;// Cr window, placed right after the Cb window
    132 
            ;// From here on the scratch block is the source: origin (0,0),
            ;// dimensions (chromaPartWidth+1) x (chromaPartHeight+1). The saved
            ;// argument slots are patched so the Cr pass reloads these values.
    133     MOV     x0, #0                  ;// x0 = 0
    134     MOV     y0, #0                  ;// y0 = 0
    135     STR     x0, [sp, #0xcc]
    136     STR     y0, [sp, #0xd0]
    137     ADD     ref, sp, #0x1c          ;// ref = block
    138     STR     ref, [sp, #0xc4]        ;// ref
    139 
    140     STR     tmp2, [sp, #0xfc]       ;// height
    141     STR     tmp1, [sp, #0xf8]       ;// width
    142     MOV     width, tmp1
    143 
    144 skip_fill
    145     MLA     tmp3, y0, width, x0     ;// tmp3 = y0*width+x0
    146     LDR     yFrac, [sp, #0x104]     ;// yFrac
    147     LDR     xFrac, [sp, #0x100]     ;// xFrac (r12/r14 are not kept across the BLs above)
    148     ADD     ptrA, ref, tmp3         ;// ptrA = ref + y0*width+x0
    149     RSB     valX, xFrac, #8         ;// valX = 8-xFrac
    150     RSB     valY, yFrac, #8         ;// valY = 8-yFrac
    151 
    152     LDR     mb, [sp, #0xc8]         ;// predPartChroma
    153 
    154 
    155     ;// pack values to count register
    156     ;// [31:28] loop_x (chromaPartWidth-1)
    157     ;// [27:24] loop_y (chromaPartHeight-1)
    158     ;// [23:20] chromaPartWidth-1
    159     ;// [19:16] chromaPartHeight-1
    160     ;// [15:00] unused (zero)
            ;// chrPW/chrPH share registers with tmp5/tmp6, which the loops
            ;// overwrite, so the dimensions are kept inside count.
    161 
    162     SUB     tmp2, chrPH, #1             ;// chromaPartHeight-1
    163     SUB     tmp1, chrPW, #1             ;// chromaPartWidth-1
    164     MOV     count, tmp2, LSL #16        ;// chromaPartHeight-1; MOV (not ADD) discards the old x0 in r2
    165     ADD     count, count, tmp2, LSL #24 ;// loop_y
    166     ADD     count, count, tmp1, LSL #20 ;// chromaPartWidth-1
    167     AND     tmp2, count, #0x00F00000    ;// loop_x
    168     PKHBT   valY, valY, yFrac, LSL #16  ;// |yFrac|valY |
    169     MOV     c32, #32                    ;// rounding term; overwrites yFrac (same register)
    170 
    171 
    172     ;///////////////////////////////////////////////////////////////////////////
    173     ;// Cb
    174     ;///////////////////////////////////////////////////////////////////////////
    175 
    176     ;// 2x2 pels per iteration
    177     ;// bilinear vertical and horizontal interpolation
    178 
    179 loop1_y
            ;// Left column of the first 2x2 group: three vertically adjacent
            ;// source pels, reduced to two vertically filtered values.
    180     LDRB    tmp1, [ptrA]
    181     LDRB    tmp3, [ptrA, width]
    182     LDRB    tmp5, [ptrA, width, LSL #1]
    183 
    184     PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    185     PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|
    186 
    187     SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
    188     SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)
    189 
    190     ADD     count, count, tmp2, LSL #8  ;// loop_x = chromaPartWidth-1 (tmp2 = bits [23:20])
    191 loop1_x
    192     ;// first
    193     LDRB    tmp2, [ptrA, #1]!           ;// advance to the next source column
    194     LDRB    tmp4, [ptrA, width]
    195     LDRB    tmp6, [ptrA, width, LSL #1]
    196 
    197     PKHBT   tmp2, tmp2, tmp4, LSL #16   ;// |t4|t2|
    198     PKHBT   tmp4, tmp4, tmp6, LSL #16   ;// |t6|t4|
    199 
    200     SMUAD   tmp2, tmp2, valY            ;// t2=(t2*valY + t4*yFrac)
    201     MLA     tmp5, tmp1, valX, c32       ;// t5=t1*valX+32
    202     MLA     tmp5, tmp2, xFrac, tmp5     ;// t5=t2*xFrac+t5
    203 
    204     SMUAD   tmp4, tmp4, valY            ;// t4=(t4*valY + t6*yFrac)
    205     MLA     tmp6, tmp3, valX, c32       ;// t6=t3*valX+32
    206     MLA     tmp6, tmp4, xFrac, tmp6     ;// t6=t4*xFrac+t6
    207 
    208     MOV     tmp6, tmp6, LSR #6          ;// scale down
    209     STRB    tmp6, [mb, #8]              ;// store pixel (row below, output row step is 8)
    210     MOV     tmp5, tmp5, LSR #6          ;// scale down
    211     STRB    tmp5, [mb], #1              ;// store pixel
    212 
    213     ;// second (the previous right column, tmp2/tmp4, is now the left column)
    214     LDRB    tmp1, [ptrA, #1]!
    215     LDRB    tmp3, [ptrA, width]
    216     LDRB    tmp5, [ptrA, width, LSL #1]
    217 
    218     PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    219     PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|
    220 
    221     SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
    222     MLA     tmp5, tmp1, xFrac, c32      ;// t5=t1*xFrac+32
    223     MLA     tmp5, tmp2, valX, tmp5      ;// t5=t2*valX+t5
    224 
    225     SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)
    226     MLA     tmp6, tmp3, xFrac, c32      ;// t6=t3*xFrac+32
    227     MLA     tmp6, tmp4, valX, tmp6      ;// t6=t4*valX+t6
    228 
    229     MOV     tmp6, tmp6, LSR #6          ;// scale down
    230     STRB    tmp6, [mb, #8]              ;// store pixel (row below)
    231     MOV     tmp5, tmp5, LSR #6          ;// scale down
    232     STRB    tmp5, [mb], #1              ;// store pixel
    233 
    234     SUBS    count, count, #2<<28        ;// loop_x -= 2; C clear on borrow
    235     BCS     loop1_x                     ;// continue while no borrow
    236 
    237     AND     tmp2, count, #0x00F00000    ;// tmp2 = (chromaPartWidth-1) << 20
    238 
            ;// ADDS leaves C clear (no address wrap expected), so each SBC
            ;// subtracts (chromaPartWidth-1)+1 = chromaPartWidth. The ADD in
            ;// between does not touch the flags.
    239     ADDS    mb, mb, #16                 ;// two output rows down ...
    240     SBC     mb, mb, tmp2, LSR #20       ;// ... and back to the first column
    241     ADD     ptrA, ptrA, width, LSL #1   ;// two source rows down ...
    242     SBC     ptrA, ptrA, tmp2, LSR #20   ;// ... and back to the first column
    243 
            ;// For even chromaPartWidth the loop_x nibble is 0xF here. Adding
            ;// 0x0E to the top byte wraps that nibble to 0 and lowers loop_y
            ;// by 2; the result is negative once loop_y is used up.
    244     ADDS    count, count, #0xE << 24
    245     BGE     loop1_y
    246 
    247     ;///////////////////////////////////////////////////////////////////////////
    248     ;// Cr
    249     ;///////////////////////////////////////////////////////////////////////////
            ;// r0-r3 and r5 were reused as ptrA/mb/count/valY/tmp4, so the
            ;// arguments are reloaded from their (possibly patched) stack slots.
    250     LDR     height, [sp,#0xfc]          ;// height
    251     LDR     ref, [sp, #0xc4]            ;// ref
    252     LDR     tmp1, [sp, #0xd0]           ;// y0
    253     LDR     tmp2, [sp, #0xcc]           ;// x0
    254     LDR     mb, [sp, #0xc8]             ;// predPartChroma
    255 
    256     ADD     tmp1, height, tmp1          ;// tmp1 = height + y0 (skip the first plane)
    257     MLA     tmp3, tmp1, width, tmp2     ;// tmp3 = (height+y0)*width + x0
    258     ADD     ptrA, ref, tmp3             ;// ptrA = ref + (height+y0)*width + x0
    259     ADD     mb, mb, #64                 ;// Cr output starts 64 bytes after Cb output
    260 
    261     AND     count, count, #0x00FFFFFF   ;// clear loop_x/loop_y, keep the dimensions
    262     AND     tmp1, count, #0x000F0000    ;// tmp1 = (chromaPartHeight-1) << 16
    263     ADD     count, count, tmp1, LSL #8  ;// loop_y = chromaPartHeight-1
    264     AND     tmp2, count, #0x00F00000    ;// tmp2 = (chromaPartWidth-1) << 20
    265 
    266     ;// 2x2 pels per iteration
    267     ;// bilinear vertical and horizontal interpolation
    268 loop2_y
    269     LDRB    tmp1, [ptrA]
    270     LDRB    tmp3, [ptrA, width]
    271     LDRB    tmp5, [ptrA, width, LSL #1]
    272 
    273     PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    274     PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|
    275 
    276     SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
    277     SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)
    278 
    279     ADD     count, count, tmp2, LSL #8  ;// loop_x = chromaPartWidth-1
    280 loop2_x
    281     ;// first
    282     LDRB    tmp2, [ptrA, #1]!           ;// advance to the next source column
    283     LDRB    tmp4, [ptrA, width]
    284     LDRB    tmp6, [ptrA, width, LSL #1]
    285 
    286     PKHBT   tmp2, tmp2, tmp4, LSL #16   ;// |t4|t2|
    287     PKHBT   tmp4, tmp4, tmp6, LSL #16   ;// |t6|t4|
    288 
    289     SMUAD   tmp2, tmp2, valY            ;// t2=(t2*valY + t4*yFrac)
    290     MLA     tmp5, tmp1, valX, c32       ;// t5=t1*valX+32
    291     MLA     tmp5, tmp2, xFrac, tmp5     ;// t5=t2*xFrac+t5
    292 
    293     SMUAD   tmp4, tmp4, valY            ;// t4=(t4*valY + t6*yFrac)
    294     MLA     tmp6, tmp3, valX, c32       ;// t6=t3*valX+32
    295     MLA     tmp6, tmp4, xFrac, tmp6     ;// t6=t4*xFrac+t6
    296 
    297     MOV     tmp6, tmp6, LSR #6          ;// scale down
    298     STRB    tmp6, [mb, #8]              ;// store pixel (row below)
    299     MOV     tmp5, tmp5, LSR #6          ;// scale down
    300     STRB    tmp5, [mb], #1              ;// store pixel
    301 
    302     ;// second (the previous right column, tmp2/tmp4, is now the left column)
    303     LDRB    tmp1, [ptrA, #1]!
    304     LDRB    tmp3, [ptrA, width]
    305     LDRB    tmp5, [ptrA, width, LSL #1]
    306 
    307     PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    308     PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|
    309 
    310     SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
    311     MLA     tmp5, tmp1, xFrac, c32      ;// t5=t1*xFrac+32
    312     MLA     tmp5, tmp2, valX, tmp5      ;// t5=t2*valX+t5
    313 
    314     SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)
    315     MLA     tmp6, tmp3, xFrac, c32      ;// t6=t3*xFrac+32
    316     MLA     tmp6, tmp4, valX, tmp6      ;// t6=t4*valX+t6
    317 
    318     MOV     tmp6, tmp6, LSR #6          ;// scale down
    319     STRB    tmp6, [mb, #8]              ;// store pixel (row below)
    320     MOV     tmp5, tmp5, LSR #6          ;// scale down
    321     STRB    tmp5, [mb], #1              ;// store pixel
    322 
    323     SUBS    count, count, #2<<28        ;// loop_x -= 2; C clear on borrow
    324     BCS     loop2_x                     ;// continue while no borrow
    325 
    326     AND     tmp2, count, #0x00F00000    ;// tmp2 = (chromaPartWidth-1) << 20
    327 
            ;// Same carry trick as in the Cb loop: C is clear after ADDS, so
            ;// each SBC subtracts chromaPartWidth.
    328     ADDS    mb, mb, #16                 ;// two output rows down ...
    329     SBC     mb, mb, tmp2, LSR #20       ;// ... and back to the first column
    330     ADD     ptrA, ptrA, width, LSL #1   ;// two source rows down ...
    331     SBC     ptrA, ptrA, tmp2, LSR #20   ;// ... and back to the first column
    332 
    333     ADDS    count, count, #0xE << 24    ;// loop_y -= 2, loop_x nibble wraps to 0
    334     BGE     loop2_y
    335 
    336     ADD     sp,sp,#0xd4                 ;// drop locals (0xc4) and saved r0-r3 (0x10)
    337     LDMFD   sp!,{r4-r11,pc}             ;// restore r4-r11 and return (saved lr -> pc)
    338 
    339     END
    340