; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorVerQuarter
;--            function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE

;// h264bsdInterpolateHorVerQuarter register allocation

ref     RN 0

mb      RN 1
buff    RN 1

count   RN 2
x0      RN 2

y0      RN 3
x_2_0   RN 3
res     RN 3

x_3_1   RN 4
tmp1    RN 4

height  RN 5
x_6_4   RN 5
tmp2    RN 5

partW   RN 6
x_7_5   RN 6
tmp3    RN 6

partH   RN 7
tmp4    RN 7

tmp5    RN 8

tmp6    RN 9

tmpa    RN 10

mult_20_01  RN 11
tmpb        RN 11

mult_20_m5  RN 12
width       RN 12

plus16  RN 14


;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateHorVerQuarter
;// Horizontal filter approach
;//
;// The basic idea of the horizontal filter is to arrange the
;// coefficients as below. Calculation is done with 16-bit arithmetic.
;//
;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
;// y_0 =   20  1     20 -5        -5         1
;// y_1 =   -5        20  1      1 20        -5
;// y_2 =    1        -5        -5 20      1 20
;// y_3 =              1        20 -5     -5 20         1
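
;// For reference, a plain-C sketch of the 6-tap filter that this
;// packing implements (hypothetical helpers, illustration only; not
;// code from this project):
;//
;//     static uint8_t Clip255(int32_t x)
;//     {
;//         return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
;//     }
;//
;//     static uint8_t FilterHor(const uint8_t *x)   /* x = 6 neighbours */
;//     {
;//         int32_t acc = x[0] - 5*x[1] + 20*x[2] + 20*x[3] - 5*x[4] + x[5];
;//         return Clip255((acc + 16) >> 5);         /* round, /32, clamp */
;//     }
;//
;// The SMLAD/SMLATB/SMLABB sequences below compute four such outputs at
;// once, accumulating two 16-bit products per instruction with the packed
;// constants mult_20_01 = (20, 1) and mult_20_m5 = (20, -5).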


h264bsdInterpolateHorVerQuarter
    STMFD   sp!, {r0-r11, lr}
    SUB     sp, sp, #0x1e4

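;// Stack frame after the prologue (offsets from sp, as used by the
;// loads/stores below):
;//   [sp,#0x00]..[sp,#0x10]   outgoing stack arguments for h264bsdFillBlock
;//   [sp,#0x28]               21x21 temporary block (p1), filled when the
;//                            reference block overlaps the picture edge
;//   [sp,#0x1e4]..[sp,#0x1f0] saved r0-r3: ref, mb, x0, y0
;//   [sp,#0x1f4]..[sp,#0x214] saved r4-r11, lr
;//   [sp,#0x218]..[sp,#0x228] caller's stacked arguments: width, height,
;//                            partWidth, partHeight, horVerOffset
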
    CMP     x0, #0
    BLT     do_fill                 ;// (x0 < 0)
    LDR     partW, [sp,#0x220]      ;// partWidth
    LDR     width, [sp,#0x218]      ;// width
    ADD     tmpa, x0, partW         ;// (x0+partWidth)
    ADD     tmpa, tmpa, #5          ;// (x0+partW+5)
    CMP     tmpa, width
    BHI     do_fill                 ;// (x0+partW+5) > width
    CMP     y0, #0
    BLT     do_fill                 ;// (y0 < 0)
    LDR     partH, [sp,#0x224]      ;// partHeight
    LDR     height, [sp,#0x21c]     ;// height
    ADD     tmp5, y0, partH         ;// (y0+partHeight)
    ADD     tmp5, tmp5, #5          ;// (y0+partH+5)
    CMP     tmp5, height
    BLS     skip_fill               ;// no overfill needed


do_fill
    LDR     partH, [sp,#0x224]      ;// partHeight
    LDR     partW, [sp,#0x220]      ;// partWidth
    LDR     height, [sp,#0x21c]     ;// height
    ADD     tmp5, partH, #5         ;// tmp5 = partH + 5
    ADD     tmpa, partW, #5         ;// tmpa = partW + 5
    STMIB   sp, {height, tmpa}      ;// sp+4 = height, sp+8 = partWidth+5
    LDR     width, [sp,#0x218]      ;// width
    STR     tmp5, [sp,#0xc]         ;// sp+c = partHeight+5
    STR     tmpa, [sp,#0x10]        ;// sp+10 = partWidth+5
    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
    BL      h264bsdFillBlock

    MOV     x0, #0
    STR     x0,[sp,#0x1ec]          ;// x0 = 0
    STR     x0,[sp,#0x1f0]          ;// y0 = 0
    ADD     ref,sp,#0x28            ;// ref = p1
    STR     tmpa, [sp,#0x218]       ;// width = partWidth+5


skip_fill
    LDR     x0, [sp,#0x1ec]         ;// x0
    LDR     y0, [sp,#0x1f0]         ;// y0
    LDR     width, [sp,#0x218]      ;// width
    LDR     tmp6, [sp,#0x228]       ;// horVerOffset
    LDR     mb, [sp, #0x1e8]        ;// mb
    MLA     tmp5, width, y0, x0     ;// y0*width+x0
    ADD     ref, ref, tmp5          ;// ref += y0*width+x0
    STR     ref, [sp, #0x1e4]       ;// store "ref" for vertical filtering
    AND     tmp6, tmp6, #2          ;// calculate ref for horizontal filter
    MOV     tmpa, #2
    ADD     tmp6, tmpa, tmp6, LSR #1
    MLA     ref, tmp6, width, ref
    ADD     ref, ref, #8            ;// ref = ref+8

    ;// pack values to count register
    ;// [31:28] loop_x (partWidth-1)
    ;// [27:24] loop_y (partHeight-1)
    ;// [23:20] partWidth-1
    ;// [19:16] partHeight-1
    ;// [15:00] width
    MOV     count, width
    SUB     partW, partW, #1
    SUB     partH, partH, #1
    ADD     tmp5, partH, partW, LSL #4
    ADD     count, count, tmp5, LSL #16
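
;// A C sketch of the packing above (illustrative only):
;//
;//     uint32_t count = width                   /* bits [15:0]  */
;//                    | ((partH - 1) << 16)     /* bits [19:16] */
;//                    | ((partW - 1) << 20);    /* bits [23:20] */
;//
;// The loop_x/loop_y fields in bits [31:24] are refilled from
;// bits [23:16] at the start of each loop round below.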


    LDR     mult_20_01, = 0x00140001    ;// constant multipliers
    LDR     mult_20_m5, = 0x0014FFFB    ;// constant multipliers
    MOV     plus16, #16                 ;// constant for add
    AND     tmp4, count, #0x000F0000    ;// partHeight-1
    AND     tmp6, count, #0x00F00000    ;// partWidth-1
    ADD     count, count, tmp4, LSL #8  ;// partH-1 to lower part of top byte

;// HORIZONTAL PART

loop_y_hor
    LDR     x_3_1, [ref, #-8]
    ADD     count, count, tmp6, LSL #8   ;// partW-1 to upper part of top byte
    LDR     x_7_5, [ref, #-4]
    UXTB16  x_2_0, x_3_1
    UXTB16  x_3_1, x_3_1, ROR #8
    UXTB16  x_6_4, x_7_5

loop_x_hor
    UXTB16  x_7_5, x_7_5, ROR #8

    SMLAD   tmp4, x_2_0, mult_20_01, plus16
    SMLATB  tmp6, x_2_0, mult_20_01, plus16
    SMLATB  tmp5, x_2_0, mult_20_m5, plus16
    SMLATB  tmpa, x_3_1, mult_20_01, plus16

    SMLAD   tmp4, x_3_1, mult_20_m5, tmp4
    SMLATB  tmp6, x_3_1, mult_20_m5, tmp6
    SMLAD   tmp5, x_3_1, mult_20_01, tmp5
    LDR     x_3_1, [ref], #4
    SMLAD   tmpa, x_6_4, mult_20_m5, tmpa

    SMLABB  tmp4, x_6_4, mult_20_m5, tmp4
    SMLADX  tmp6, x_6_4, mult_20_m5, tmp6
    SMLADX  tmp5, x_6_4, mult_20_01, tmp5
    SMLADX  tmpa, x_7_5, mult_20_m5, tmpa

    SMLABB  tmp4, x_7_5, mult_20_01, tmp4
    UXTB16  x_2_0, x_3_1
    SMLABB  tmp5, x_7_5, mult_20_m5, tmp5
    SMLADX  tmp6, x_7_5, mult_20_01, tmp6
    SMLABB  tmpa, x_2_0, mult_20_01, tmpa

    MOV     tmp5, tmp5, ASR #5
    MOV     tmp4, tmp4, ASR #5
    PKHBT   tmp5, tmp5, tmpa, LSL #(16-5)
    PKHBT   tmp4, tmp4, tmp6, LSL #(16-5)
    USAT16  tmp5, #8, tmp5
    USAT16  tmp4, #8, tmp4

    SUBS    count, count, #4<<28
    ORR     tmp4, tmp4, tmp5, LSL #8
    STR     tmp4, [mb], #4
    BCC     next_y_hor

    UXTB16  x_3_1, x_3_1, ROR #8

    SMLAD   tmp4, x_6_4, mult_20_01, plus16
    SMLATB  tmp6, x_6_4, mult_20_01, plus16
    SMLATB  tmp5, x_6_4, mult_20_m5, plus16
    SMLATB  tmpa, x_7_5, mult_20_01, plus16

    SMLAD   tmp4, x_7_5, mult_20_m5, tmp4
    SMLATB  tmp6, x_7_5, mult_20_m5, tmp6
    SMLAD   tmp5, x_7_5, mult_20_01, tmp5
    LDR     x_7_5, [ref], #4
    SMLAD   tmpa, x_2_0, mult_20_m5, tmpa

    SMLABB  tmp4, x_2_0, mult_20_m5, tmp4
    SMLADX  tmp6, x_2_0, mult_20_m5, tmp6
    SMLADX  tmp5, x_2_0, mult_20_01, tmp5
    SMLADX  tmpa, x_3_1, mult_20_m5, tmpa

    SMLABB  tmp4, x_3_1, mult_20_01, tmp4
    UXTB16  x_6_4, x_7_5
    SMLABB  tmp5, x_3_1, mult_20_m5, tmp5
    SMLADX  tmp6, x_3_1, mult_20_01, tmp6
    SMLABB  tmpa, x_6_4, mult_20_01, tmpa

    MOV     tmp5, tmp5, ASR #5
    MOV     tmp4, tmp4, ASR #5
    PKHBT   tmp5, tmp5, tmpa, LSL #(16-5)
    PKHBT   tmp4, tmp4, tmp6, LSL #(16-5)
    USAT16  tmp5, #8, tmp5
    USAT16  tmp4, #8, tmp4

    SUBS    count, count, #4<<28
    ORR     tmp4, tmp4, tmp5, LSL #8
    STR     tmp4, [mb], #4
    BCS     loop_x_hor

next_y_hor
    AND     tmp6, count, #0x00F00000        ;// partWidth-1
    SMLABB  ref, count, mult_20_01, ref     ;// +width
    ADDS    mb, mb, #16                     ;// +16, Carry=0
    SBC     mb, mb, tmp6, LSR #20           ;// -(partWidth-1)-1
    SBC     ref, ref, tmp6, LSR #20         ;// -(partWidth-1)-1
    ADDS    count, count, #(1<<28)-(1<<24)  ;// decrement loop_y, reset loop_x
    BGE     loop_y_hor


;// VERTICAL PART
;//
;// Approach to vertical interpolation
;//
;// Interpolation is done using 32-bit loads and stores and 16-bit
;// arithmetic. A 4x4 block is processed in each round.
;//
;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
;//           ..
;//           ..
;// |a_m1|a_m1|a_m1|a_m1|...
;// |b_m1|b_m1|b_m1|b_m1|...
;// |c_m1|c_m1|c_m1|c_m1|...
;// |d_m1|d_m1|d_m1|d_m1|...
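
;// In C terms, one output byte of the vertical 6-tap filter is computed
;// as below (hypothetical helper, illustration only; A,C,G,M,R,T are the
;// six vertically adjacent source rows named in the comments that follow):
;//
;//     static uint8_t FilterVer(int A, int C, int G, int M, int R, int T)
;//     {
;//         int32_t acc = 16 + 20*(G + M) - 5*(C + R) + A + T;
;//         if (acc < 0)    acc = 0;       /* USAT16 #13 clamps the  */
;//         if (acc > 8191) acc = 8191;    /* sum to [0, 8191]       */
;//         return (uint8_t)(acc >> 5);    /* then divide by 32      */
;//     }
;//
;// The UXTB16/UXTAB16 sequences below evaluate this in four 16-bit
;// lanes, i.e. for four horizontally adjacent pixels at a time.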

;// Approach to bilinear interpolation to quarter-pel position:
;// 4 bytes are processed in parallel.
;//
;// The algorithm is (a+b+1)/2. Rounding upwards (+1) is achieved by
;// negating the second operand to get its one's complement (instead of
;// two's complement) and using a halving subtraction; EOR is then used
;// to correct the sign:
;//
;// MVN     b, b
;// UHSUB8  a, a, b
;// EOR     a, a, 0x80808080
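;//
;// Why this works (per byte): with b' = ~b = 255 - b,
;//
;//     UHSUB8: (a - b') >> 1  =  (a + b - 255) >> 1
;//                            =  ((a + b + 1) >> 1) - 128
;//
;// so flipping the top bit with EOR 0x80 restores (a+b+1)>>1.
;// Equivalent C for one byte (hypothetical helper, illustration only):
;//
;//     static uint8_t AvgRound(uint8_t a, uint8_t b)
;//     {
;//         int32_t d = a + b - 255;        /* a - (255 - b)            */
;//         uint8_t h = (uint8_t)(d >> 1);  /* arithmetic shift, as the */
;//                                         /* UHSUB8 halving behaves   */
;//         return h ^ 0x80;                /* EOR corrects the bias    */
;//     }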


    LDR     ref, [sp, #0x1e4]           ;// ref
    LDR     tmpa, [sp, #0x228]          ;// horVerOffset
    LDR     mb, [sp, #0x1e8]            ;// mb
    LDR     width, [sp, #0x218]         ;// width
    ADD     ref, ref, #2                ;// calculate correct position
    AND     tmpa, tmpa, #1
    ADD     ref, ref, tmpa
    LDR     plus16, = 0x00100010        ;// +16 to lower and upper halfwords
    AND     count, count, #0x00FFFFFF   ;// clear loop_x and loop_y fields

    AND     tmpa, count, #0x000F0000    ;// partHeight-1
    ADD     count, count, tmpa, LSL #8

loop_y
    ADD     count, count, tmp6, LSL #8  ;// partWidth-1

loop_x
    LDR     tmp1, [ref], width     ;// |a4|a3|a2|a1|
    LDR     tmp2, [ref], width     ;// |c4|c3|c2|c1|
    LDR     tmp3, [ref], width     ;// |g4|g3|g2|g1|
    LDR     tmp4, [ref], width     ;// |m4|m3|m2|m1|
    LDR     tmp5, [ref], width     ;// |r4|r3|r2|r1|
    LDR     tmp6, [ref], width     ;// |t4|t3|t2|t1|

    ;// first four pixels
    UXTB16  tmpa, tmp3                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp4            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp2                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)

    UXTAB16 tmpb, tmpb, tmp5            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp3, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp2, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp5, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp1, [mb]
    LDR     tmpa, = 0xFF00FF00
    MVN     tmp1, tmp1
    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa

    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp1              ;// bilinear interpolation
    LDR     tmp1, [ref], width          ;// load next row
    EOR     res, res, tmpa              ;// correct sign

    STR     res, [mb], #16              ;// next row (mb)


    ;// tmp2 = |a4|a3|a2|a1|
    ;// tmp3 = |c4|c3|c2|c1|
    ;// tmp4 = |g4|g3|g2|g1|
    ;// tmp5 = |m4|m3|m2|m1|
    ;// tmp6 = |r4|r3|r2|r1|
    ;// tmp1 = |t4|t3|t2|t1|

    ;// second four pixels
    UXTB16  tmpa, tmp4                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp5            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp3                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp6            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp4, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp5, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp3, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp6, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp2, [mb]
    LDR     tmpa, = 0xFF00FF00
    MVN     tmp2, tmp2

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp2              ;// bilinear interpolation
    LDR     tmp2, [ref], width          ;// load next row
    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #16              ;// next row

    ;// tmp3 = |a4|a3|a2|a1|
    ;// tmp4 = |c4|c3|c2|c1|
    ;// tmp5 = |g4|g3|g2|g1|
    ;// tmp6 = |m4|m3|m2|m1|
    ;// tmp1 = |r4|r3|r2|r1|
    ;// tmp2 = |t4|t3|t2|t1|

    ;// third four pixels
    UXTB16  tmpa, tmp5                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp6            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp4                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp1            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp5, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp4, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp1, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp3, [mb]
    LDR     tmpa, = 0xFF00FF00
    MVN     tmp3, tmp3

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp3              ;// bilinear interpolation
    LDR     tmp3, [ref]                 ;// load next row
    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #16              ;// next row

    ;// tmp4 = |a4|a3|a2|a1|
    ;// tmp5 = |c4|c3|c2|c1|
    ;// tmp6 = |g4|g3|g2|g1|
    ;// tmp1 = |m4|m3|m2|m1|
    ;// tmp2 = |r4|r3|r2|r1|
    ;// tmp3 = |t4|t3|t2|t1|

    ;// fourth four pixels
    UXTB16  tmpa, tmp6                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp1            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp5                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp2            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp4            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp6, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp5, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp2, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp5, [mb]
    LDR     tmp4, = 0xFF00FF00
    MVN     tmp5, tmp5

    AND     tmpa, tmp4, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp5              ;// bilinear interpolation

    ;// decrement loop_x counter
    SUBS    count, count, #4<<28        ;// decrement x loop counter

    ;// calculate "ref" address for next round
    SUB     ref, ref, width, LSL #3     ;// ref -= 8*width;
    ADD     ref, ref, #4                ;// next column (4 pixels)

    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #-44

    BCS     loop_x

    ADDS    mb, mb, #64                 ;// set Carry=0
    ADD     ref, ref, width, LSL #2     ;// ref += 4*width
    AND     tmp6, count, #0x00F00000    ;// partWidth-1
    SBC     ref, ref, tmp6, LSR #20     ;// -(partWidth-1)-1
    SBC     mb, mb, tmp6, LSR #20       ;// -(partWidth-1)-1

    ADDS    count, count, #0xC << 24    ;// decrement y loop counter
    BGE     loop_y

    ADD     sp, sp, #0x1f4
    LDMFD   sp!, {r4-r11, pc}

    END