Home | History | Annotate | Download | only in libpixelflinger
      1 /* libs/pixelflinger/col32cb16blend_neon.S
      2  *
      3  * Copyright (C) 2009 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at
      8  *
      9  *      http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  */
     17 
     18 
     19     .text                                       // emit what follows into the code section
     20     .align                                      // no operand given; NOTE(review): GNU as for ARM is
                                                        //  documented to pad to a 4-byte boundary here --
                                                        //  confirm for the toolchain in use
     21 
     22     .global scanline_col32cb16blend_neon        // make the entry label visible to the linker
     23 
     24 //
     25 // scanline_col32cb16blend_neon blends one fixed 32-bit color into a scanline of
     26 // 16-bit destination pixels. Every color channel is computed as:
     27 //
     28 //     d = ((s << n) + (f * d)) >> 8,    with f = 256 - (a + (a >> 7))
     29 //
     30 // where d is the destination channel (5 bits for red/blue, 6 bits for green),
     31 //       s is the 8-bit source color channel (n = 5 for red/blue, 6 for green),
     32 //       a is the alpha channel of the source color.
     33 //
     34 // The NEON implementation processes 16 pixels per iteration. The remaining 0 - 15
     35 // pixels are processed one at a time in ARM code.
     36 //
     37 
     38 // r0 = destination buffer pointer (16-bit pixels: red 15-11, green 10-5, blue 4-0)
     39 // r1 = color pointer (bytes in memory order: red, green, blue, alpha)
     40 // r2 = count, in pixels
        //
        // Written without being restored: r0 (advanced past every pixel stored),
        // r3, r12, q0-q3, q8-q15 and the condition flags.
        // r4-r11 and lr are pushed on entry and popped on exit; r9 and r10 are
        // saved but not otherwise touched by the code below.
        // NOTE(review): the scalar tail reads the color as one 32-bit word and takes
        // red from bits 7-0; that matches the byte order above only for little-endian
        // data accesses -- confirm against the caller.
     41 
     42 
     43 scanline_col32cb16blend_neon:
     44     push        {r4-r11, lr}                    // save r4-r11 and lr (popped into pc on exit)
     45 
     46     vmov.u16    q15, #256                       // q15 = 256 in every 16-bit lane
     47     movs        r3, r2, lsr #4                  // r3 = count / 16; Z is set when that is 0
     48     vmov.u16    q14, #0x1f                      // q14 = 0x1f, mask for the low 5 bits (blue)
     49 
     50     beq         2f                              // Z from the movs above: no full block of 16
     51 
     52     vld4.8      {d0[], d2[], d4[], d6[]}, [r1]  // load the four color bytes, one per register,
     53                                                 //  each replicated to all lanes, such that
     54                                                 //  d0 = 8 equal red values
     55                                                 //  d2 = 8 equal green values
     56                                                 //  d4 = 8 equal blue values
     57                                                 //  d6 = 8 equal alpha values
     58     vshll.u8    q0, d0, #5                      // q0 = src red   << 5, widened to 16 bits
     59     vshll.u8    q1, d2, #6                      // q1 = src green << 6, widened to 16 bits
     60     vshll.u8    q2, d4, #5                      // q2 = src blue  << 5, widened to 16 bits
     61 
     62     vshr.u8     d7, d6, #7                      // d7 = alpha >> 7 (0 or 1)
     63     vaddl.u8    q3, d6, d7                      // q3 = alpha + (alpha >> 7), widened to 16 bits
     64     vsub.u16    q3, q15, q3                     // q3 = f = 256 - q3 (inverse-alpha factor)
     65 
     66 1:
     67     // This loop processes 16 pixels per iteration. In the comments, references to
     68     // the first eight pixels are suffixed with "0" (red0, green0, blue0),
     69     // the second eight are suffixed "1". The two halves are interleaved.
     70                                                 // q8  = dst red0
     71                                                 // q9  = dst green0
     72                                                 // q10 = packed pixels 0-7, then dst blue0, then result
     73                                                 // q13 = dst red1
     74                                                 // q12 = dst green1
     75                                                 // q11 = packed pixels 8-15, then dst blue1, then result
     76 
     77     vld1.16     {d20, d21, d22, d23}, [r0]      // q10:q11 = 16 dest pixels (r0 not advanced yet)
     78     vshr.u16    q8, q10, #11                    // red0 = pixel >> 11 (low 5 bits)
     79     pld         [r0, #63]                       // prefetch hint at r0 + 63, the last byte of the
                                                        //  following 32-byte group of 16 pixels
     80     vshl.u16    q9, q10, #5                     // push red out of the top: green0 in top 6 bits
     81     vand        q10, q10, q14                   // blue0 = pixel & 0x1f
     82     vshr.u16    q9, q9, #10                     // green0 moved down to the low 6 bits
     83     vmul.u16    q8, q8, q3                      // red0 *= f
     84     vshl.u16    q12, q11, #5                    // push red out of the top: green1 in top 6 bits
     85     vmul.u16    q9, q9, q3                      // green0 *= f
     86     vshr.u16    q13, q11, #11                   // red1 = pixel >> 11 (low 5 bits)
     87     vmul.u16    q10, q10, q3                    // blue0 *= f
     88     vshr.u16    q12, q12, #10                   // green1 moved down to the low 6 bits
     89     vand        q11, q11, q14                   // blue1 = pixel & 0x1f
     90     vadd.u16    q8, q8, q0                      // red0 += src red << 5
     91     vmul.u16    q13, q13, q3                    // red1 *= f
     92     vadd.u16    q9, q9, q1                      // green0 += src green << 6
     93     vmul.u16    q12, q12, q3                    // green1 *= f
     94     vadd.u16    q10, q10, q2                    // blue0 += src blue << 5
     95     vmul.u16    q11, q11, q3                    // blue1 *= f
     96     vshr.u16    q8, q8, #8                      // red0 >>= 8 (final channel value)
     97     vadd.u16    q13, q13, q0                    // red1 += src red << 5
     98     vshr.u16    q9, q9, #8                      // green0 >>= 8
     99     vadd.u16    q12, q12, q1                    // green1 += src green << 6
    100     vshr.u16    q10, q10, #8                    // blue0 >>= 8
    101     vadd.u16    q11, q11, q2                    // blue1 += src blue << 5
    102     vsli.u16    q10, q9, #5                     // insert green0 at bit 5, keeping blue0 below it
    103     vshr.u16    q13, q13, #8                    // red1 >>= 8
    104     vsli.u16    q10, q8, #11                    // insert red0 at bit 11: q10 = packed pixels 0-7
    105     vshr.u16    q12, q12, #8                    // green1 >>= 8
    106     vshr.u16    q11, q11, #8                    // blue1 >>= 8
    107     subs        r3, r3, #1                      // one block of 16 done; sets Z for the bne below
    108     vsli.u16    q11, q12, #5                    // insert green1 at bit 5, keeping blue1 below it
    109     vsli.u16    q11, q13, #11                   // insert red1 at bit 11: q11 = packed pixels 8-15
    110 
    111     vst1.16     {d20, d21, d22, d23}, [r0]!     // store 16 pixels, advance r0 past them
    112     bne         1b                              // more blocks left (flags from the subs above)
    113 
    114 2:
    115     ands        r3, r2, #15                     // r3 = count % 16, the leftover pixels
    116     beq         4f                              // none left: exit
    117 
    118     ldr         r4, [r1]                        // r4 = source color as one 32-bit word
    119     mov         r5, r4, lsr #24                 // r5 = alpha (bits 31-24)
    120     add         r5, r5, r5, lsr #7              // r5 = alpha + (alpha >> 7)
    121     rsb         r5, r5, #256                    // r5 = f = 256 - r5 (inverse-alpha factor)
    122     and         r11, r4, #0xff                  // r11 = red   (bits 7-0)
    123     ubfx        r12, r4, #8, #8                 // r12 = green (bits 15-8)
    124     ubfx        r4, r4, #16, #8                 // r4  = blue  (bits 23-16), word no longer needed
    125     mov         r11, r11, lsl #5                // r11 = src red   << 5
    126     mov         r12, r12, lsl #6                // r12 = src green << 6
    127     mov         r4, r4, lsl #5                  // r4  = src blue  << 5
    128 
    129 3:
    130     ldrh        r8, [r0]                        // r8 = one dest pixel
    131     subs        r3, r3, #1                      // one pixel consumed; sets Z for the bne below
    132     mov         r6, r8, lsr #11                 // r6 = dest red   (bits 15-11)
    133     ubfx        r7, r8, #5, #6                  // r7 = dest green (bits 10-5)
    134     and         r8, r8, #0x1f                   // r8 = dest blue  (bits 4-0)
    135 
    136     smlabb      r6, r6, r5, r11                 // r6 = dest red   * f + (src red   << 5)
    137     smlabb      r7, r7, r5, r12                 // r7 = dest green * f + (src green << 6)
    138     smlabb      r8, r8, r5, r4                  // r8 = dest blue  * f + (src blue  << 5)
    139 
    140     mov         r6, r6, lsr #8                  // red >>= 8
    141     mov         r7, r7, lsr #8                  // green >>= 8
    142     mov         r6, r6, lsl #11                 // place red at bits 15-11
    143     orr         r6, r7, lsl #5                  // OR in green at bit 5
    144     orr         r6, r8, lsr #8                  // OR in blue >> 8 at bit 0
    145 
    146     strh        r6, [r0], #2                    // store low 16 bits as the pixel, r0 += 2
    147     bne         3b                              // more pixels left (flags from the subs above)
    148 4:
    149 
    150     pop         {r4-r11, pc}                    // restore r4-r11 and return via the saved lr
    151 
    152 
    153 
    154