/* libs/pixelflinger/t32cb16blend.S
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/


        .text
        .syntax unified
        .align

        .global scanline_t32cb16blend_arm
        .type   scanline_t32cb16blend_arm, %function


/*
 * .macro pixel
 *
 * Blends one 32-bit source pixel over one RGB565 destination pixel.
 *
 * \DREG is a 32-bit register containing *two* original destination RGB565
 *       pixels, with the even one in the low 16 bits, and the odd one in
 *       the high 16 bits.
 *
 * \SRC  is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
 *
 * \FB   is a target register that will contain the blended pixel values.
 *       With \ODD == 0 the register is overwritten (the first write is a
 *       MOV), so its upper 16 bits end up cleared.
 *       With \ODD != 0 the result is OR'ed into the upper 16 bits, so those
 *       bits must be zero on entry; invoking the macro with \ODD == 0 first
 *       on the same \FB guarantees that.
 *
 * \ODD  is either 0 or 1 and indicates if we're blending the lower (0) or
 *       the upper (1) 16-bit pixel of \DREG into \FB.
 *
 * For every channel the macro computes
 *       out = min(src_channel + ((dst_channel * f) >> 8), channel_max)
 * with  f   = 0x100 - (sA + (sA >> 7)),  sA being the source alpha byte.
 *
 * clobbered: r6, r7, lr, condition flags
 *
 */

.macro pixel,   DREG, SRC, FB, ODD

        // SRC = 0xAABBGGRR
        mov     r7, \SRC, lsr #24               // sA
        add     r7, r7, r7, lsr #7              // sA + (sA >> 7)
        rsb     r7, r7, #0x100                  // sA = 0x100 - (sA+(sA>>7))

.if \ODD

        // red: destination bits 31..27, no mask needed after the shift
        mov     lr, \DREG, lsr #(16 + 11)
        smulbb  lr, r7, lr
        mov     r6, \SRC, lsr #3
        and     r6, r6, #0x1F
        add     lr, r6, lr, lsr #8
        cmp     lr, #0x1F                       // saturate to 5 bits
        orrhs   \FB, \FB, #(0x1F<<(16 + 11))
        orrlo   \FB, \FB, lr, lsl #(16 + 11)

        // green: kept in place in the top halfword (hence smulbt), so the
        // product carries an extra factor of 1<<5 removed by the shift below
        and     r6, \DREG, #(0x3F<<(16 + 5))
        smulbt  r6, r7, r6
        mov     lr, \SRC, lsr #(8+2)
        and     lr, lr, #0x3F
        add     r6, lr, r6, lsr #(5+8)
        cmp     r6, #0x3F                       // saturate to 6 bits
        orrhs   \FB, \FB, #(0x3F<<(16 + 5))
        orrlo   \FB, \FB, r6, lsl #(16 + 5)

        // blue: lowest 5 bits of the top halfword
        and     lr, \DREG, #(0x1F << 16)
        smulbt  lr, r7, lr
        mov     r6, \SRC, lsr #(8+8+3)
        and     r6, r6, #0x1F
        add     lr, r6, lr, lsr #8
        cmp     lr, #0x1F                       // saturate to 5 bits
        orrhs   \FB, \FB, #(0x1F << 16)
        orrlo   \FB, \FB, lr, lsl #16

.else

        // red: MOV (not ORR) initializes \FB for this pixel pair
        mov     lr, \DREG, lsr #11
        and     lr, lr, #0x1F
        smulbb  lr, r7, lr
        mov     r6, \SRC, lsr #3
        and     r6, r6, #0x1F
        add     lr, r6, lr, lsr #8
        cmp     lr, #0x1F                       // saturate to 5 bits
        movhs   \FB, #(0x1F<<11)
        movlo   \FB, lr, lsl #11

        // green: kept in place (bits 10..5), so the product carries an
        // extra factor of 1<<5 removed by the shift below
        and     r6, \DREG, #(0x3F<<5)
        smulbb  r6, r7, r6
        mov     lr, \SRC, lsr #(8+2)
        and     lr, lr, #0x3F
        add     r6, lr, r6, lsr #(5+8)
        cmp     r6, #0x3F                       // saturate to 6 bits
        orrhs   \FB, \FB, #(0x3F<<5)
        orrlo   \FB, \FB, r6, lsl #5

        // blue
        and     lr, \DREG, #0x1F
        smulbb  lr, r7, lr
        mov     r6, \SRC, lsr #(8+8+3)
        and     r6, r6, #0x1F
        add     lr, r6, lr, lsr #8
        cmp     lr, #0x1F                       // saturate to 5 bits
        orrhs   \FB, \FB, #0x1F
        orrlo   \FB, \FB, lr

.endif

.endm


/*
 * scanline_t32cb16blend_arm
 *
 * Blends a run of 32-bit 0xAABBGGRR pre-multiplied source pixels over a run
 * of RGB565 destination pixels, in place.
 *
 * in:  r0 = dst ptr (16-bit pixels)
 *      r1 = src ptr (32-bit pixels)
 *      r2 = count, in pixels
 *
 * r4-r7 and lr are saved on entry and restored before returning; r0-r3 and
 * r12 are used as scratch.
 */

// r0:  dst ptr
// r1:  src ptr
// r2:  count
// r3:  d
// r4:  s0
// r5:  s1
// r6:  pixel
// r7:  pixel
// r8:  free
// r9:  free
// r10: free
// r11: free
// r12: scratch (blended output of the pixel macro)
// r14: pixel

scanline_t32cb16blend_arm:
        stmfd   sp!, {r4-r7, lr}

        pld     [r0]
        pld     [r1]

        // align DST to 32 bits
        tst     r0, #0x3
        beq     aligned
        subs    r2, r2, #1
        ldmfdlo sp!, {r4-r7, lr}                // count was 0: return
        bxlo    lr

        // Blend exactly one pixel. Used both for the unaligned leading pixel
        // and, via "b last" at the bottom, for the odd trailing pixel.
last:
        ldr     r4, [r1], #4
        ldrh    r3, [r0]
        pixel   r3, r4, r12, 0
        strh    r12, [r0], #2

aligned:
        subs    r2, r2, #2
        blo     9f                              // fewer than 2 pixels left

        // The main loop is unrolled twice and processes 4 pixels
8:      ldmia   r1!, {r4, r5}
        // stream the source
        pld     [r1, #32]
        add     r0, r0, #4
        // both source pixels are all zero: leave the destination pair as is
        orrs    r3, r4, r5
        beq     7f

        // load the destination
        ldr     r3, [r0, #-4]
        // stream the destination
        pld     [r0, #32]
        pixel   r3, r4, r12, 0
        pixel   r3, r5, r12, 1
        // effectively, we're getting write-combining by virtue of the
        // cpu's write-back cache.
        str     r12, [r0, #-4]

        // 2nd iteration of the loop, don't stream anything
        subs    r2, r2, #2
        blt     9f
        ldmia   r1!, {r4, r5}
        add     r0, r0, #4
        orrs    r3, r4, r5
        beq     7f
        ldr     r3, [r0, #-4]
        pixel   r3, r4, r12, 0
        pixel   r3, r5, r12, 1
        str     r12, [r0, #-4]

7:      subs    r2, r2, #2
        bhs     8b

        // Here r2 is -2 (nothing left) or -1 (one pixel left). The ADDS
        // leaves carry clear only in the first case. "last" reloads r4 from
        // [r1], so no source register has to be carried over.
9:      adds    r2, r2, #1
        ldmfdlo sp!, {r4-r7, lr}                // return
        bxlo    lr
        b       last

        .size   scanline_t32cb16blend_arm, . - scanline_t32cb16blend_arm