/* libs/pixelflinger/t32cb16blend.S
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/
     17 
     18 
     19 	.text
     20 	.syntax unified
     21 	.balign 4
     22 
     23 	.global scanline_t32cb16blend_arm
     24 
     25 
     26 /*
     27  * .macro pixel
     28  *
     29  * \DREG is a 32-bit register containing *two* original destination RGB565
     30  *       pixels, with the even one in the low-16 bits, and the odd one in the
     31  *       high 16 bits.
     32  *
     33  * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
     34  *
     35  * \FB is a target register that will contain the blended pixel values.
     36  *
     37  * \ODD is either 0 or 1 and indicates if we're blending the lower or
     38  *      upper 16-bit pixels in DREG into FB
     39  *
     40  *
     41  * clobbered: r6, r7, lr
     42  *
     43  */
     44 
     45 .macro pixel,   DREG, SRC, FB, ODD
     46 
     47     // SRC = 0xAABBGGRR
     48     mov     r7, \SRC, lsr #24           // sA
     49     add     r7, r7, r7, lsr #7          // sA + (sA >> 7)
     50     rsb     r7, r7, #0x100              // sA = 0x100 - (sA+(sA>>7))
     51 
     52 1:
     53 
     54 .if \ODD
     55 
     56     // red
     57     mov     lr, \DREG, lsr #(16 + 11)
     58     smulbb  lr, r7, lr
     59     mov     r6, \SRC, lsr #3
     60     and     r6, r6, #0x1F
     61     add     lr, r6, lr, lsr #8
     62     cmp     lr, #0x1F
     63     orrhs   \FB, \FB, #(0x1F<<(16 + 11))
     64     orrlo   \FB, \FB, lr, lsl #(16 + 11)
     65 
     66         // green
     67         and     r6, \DREG, #(0x3F<<(16 + 5))
     68         smulbt  r6, r7, r6
     69         mov     lr, \SRC, lsr #(8+2)
     70         and     lr, lr, #0x3F
     71         add     r6, lr, r6, lsr #(5+8)
     72         cmp     r6, #0x3F
     73         orrhs   \FB, \FB, #(0x3F<<(16 + 5))
     74         orrlo   \FB, \FB, r6, lsl #(16 + 5)
     75 
     76             // blue
     77             and     lr, \DREG, #(0x1F << 16)
     78             smulbt  lr, r7, lr
     79             mov     r6, \SRC, lsr #(8+8+3)
     80             and     r6, r6, #0x1F
     81             add     lr, r6, lr, lsr #8
     82             cmp     lr, #0x1F
     83             orrhs   \FB, \FB, #(0x1F << 16)
     84             orrlo   \FB, \FB, lr, lsl #16
     85 
     86 .else
     87 
     88     // red
     89     mov     lr, \DREG, lsr #11
     90     and     lr, lr, #0x1F
     91     smulbb  lr, r7, lr
     92     mov     r6, \SRC, lsr #3
     93     and     r6, r6, #0x1F
     94     add     lr, r6, lr, lsr #8
     95     cmp     lr, #0x1F
     96     movhs   \FB, #(0x1F<<11)
     97     movlo   \FB, lr, lsl #11
     98 
     99 
    100         // green
    101         and     r6, \DREG, #(0x3F<<5)
    102         smulbb  r6, r7, r6
    103         mov     lr, \SRC, lsr #(8+2)
    104         and     lr, lr, #0x3F
    105         add     r6, lr, r6, lsr #(5+8)
    106         cmp     r6, #0x3F
    107         orrhs   \FB, \FB, #(0x3F<<5)
    108         orrlo   \FB, \FB, r6, lsl #5
    109 
    110             // blue
    111             and     lr, \DREG, #0x1F
    112             smulbb  lr, r7, lr
    113             mov     r6, \SRC, lsr #(8+8+3)
    114             and     r6, r6, #0x1F
    115             add     lr, r6, lr, lsr #8
    116             cmp     lr, #0x1F
    117             orrhs   \FB, \FB, #0x1F
    118             orrlo   \FB, \FB, lr
    119 
    120 .endif
    121 
    122     .endm
    123 
    124 
    125 // r0:  dst ptr
    126 // r1:  src ptr
    127 // r2:  count
    128 // r3:  d
    129 // r4:  s0
    130 // r5:  s1
    131 // r6:  pixel
    132 // r7:  pixel
    133 // r8:  free
    134 // r9:  free
    135 // r10: free
    136 // r11: free
    137 // r12: scratch
    138 // r14: pixel
    139 
    140 scanline_t32cb16blend_arm:
    141     stmfd	sp!, {r4-r7, lr}
    142 
    143     pld     [r0]
    144     pld     [r1]
    145 
    146     // align DST to 32 bits
    147     tst     r0, #0x3
    148     beq     aligned
    149     subs    r2, r2, #1
    150     ldmfdlo sp!, {r4-r7, lr}        // return
    151     bxlo    lr
    152 
    153 last:
    154     ldr     r4, [r1], #4
    155     ldrh    r3, [r0]
    156     pixel   r3, r4, r12, 0
    157     strh    r12, [r0], #2
    158 
    159 aligned:
    160     subs    r2, r2, #2
    161     blo     9f
    162 
    163     // The main loop is unrolled twice and processes 4 pixels
    164 8:  ldmia   r1!, {r4, r5}
    165     // stream the source
    166     pld     [r1, #32]
    167     add     r0, r0, #4
    168     // it's all zero, skip this pixel
    169     orrs    r3, r4, r5
    170     beq     7f
    171 
    172     // load the destination
    173     ldr     r3, [r0, #-4]
    174     // stream the destination
    175     pld     [r0, #32]
    176     pixel   r3, r4, r12, 0
    177     pixel   r3, r5, r12, 1
    178     // effectively, we're getting write-combining by virtue of the
    179     // cpu's write-back cache.
    180     str     r12, [r0, #-4]
    181 
    182     // 2nd iterration of the loop, don't stream anything
    183     subs    r2, r2, #2
    184     movlt   r4, r5
    185     blt     9f
    186     ldmia   r1!, {r4, r5}
    187     add     r0, r0, #4
    188     orrs    r3, r4, r5
    189     beq     7f
    190     ldr     r3, [r0, #-4]
    191     pixel   r3, r4, r12, 0
    192     pixel   r3, r5, r12, 16
    193     str     r12, [r0, #-4]
    194 
    195 
    196 7:  subs    r2, r2, #2
    197     bhs     8b
    198     mov     r4, r5
    199 
    200 9:  adds    r2, r2, #1
    201     ldmfdlo sp!, {r4-r7, lr}        // return
    202     bxlo    lr
    203     b       last
    204