/*
**
** Copyright 2009, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/

#ifdef ANDROID
    #include <machine/cpu-features.h>
#endif

#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"

#if defined(__ARM_HAVE_NEON)
#include <arm_neon.h>
#endif

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src, int count,
                                  U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
            "ands       ip, %[count], #7            \n\t"
            "vmov.u8    d31, #1<<7                  \n\t"
            "vld1.16    {q12}, [%[dst]]             \n\t"
            "vld4.8     {d0-d3}, [%[src]]           \n\t"
            "moveq      ip, #8                      \n\t"
            "mov        %[keep_dst], %[dst]         \n\t"

            "add        %[src], %[src], ip, LSL#2   \n\t"
            "add        %[dst], %[dst], ip, LSL#1   \n\t"
            "subs       %[count], %[count], ip      \n\t"
            "b          9f                          \n\t"
            // LOOP
            "2:                                     \n\t"

            "vld1.16    {q12}, [%[dst]]!            \n\t"
            "vld4.8     {d0-d3}, [%[src]]!          \n\t"
            "vst1.16    {q10}, [%[keep_dst]]        \n\t"
            "sub        %[keep_dst], %[dst], #8*2   \n\t"
            "subs       %[count], %[count], #8      \n\t"
            "9:                                     \n\t"
            "pld        [%[dst],#32]                \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16  d4, q12                     \n\t"
            "vshr.u16   q11, q12, #5                \n\t"
            "vshr.u16   q10, q12, #6+5              \n\t"
            "vmovn.u16  d5, q11                     \n\t"
            "vmovn.u16  d6, q10                     \n\t"
            "vshl.u8    d4, d4, #3                  \n\t"
            "vshl.u8    d5, d5, #2                  \n\t"
            "vshl.u8    d6, d6, #3                  \n\t"

            "vmovl.u8   q14, d31                    \n\t"
            "vmovl.u8   q13, d31                    \n\t"
            "vmovl.u8   q12, d31                    \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8     d30, d3                     \n\t"
            "vmlal.u8   q14, d30, d6                \n\t"
            "vmlal.u8   q13, d30, d5                \n\t"
            "vmlal.u8   q12, d30, d4                \n\t"
            "vshr.u16   q8, q14, #5                 \n\t"
            "vshr.u16   q9, q13, #6                 \n\t"
            "vaddhn.u16 d6, q14, q8                 \n\t"
            "vshr.u16   q8, q12, #5                 \n\t"
            "vaddhn.u16 d5, q13, q9                 \n\t"
            "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
            "vaddhn.u16 d4, q12, q8                 \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

            "vqadd.u8   d5, d5, d1                  \n\t"
            "vqadd.u8   d4, d4, d2                  \n\t"

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8   q10, d6, #8                 \n\t"
            "vshll.u8   q3, d5, #8                  \n\t"
            "vshll.u8   q2, d4, #8                  \n\t"
            "vsri.u16   q10, q3, #5                 \n\t"
            "vsri.u16   q10, q2, #11                \n\t"

            "bne        2b                          \n\t"

            "1:                                     \n\t"
            "vst1.16    {q10}, [%[keep_dst]]        \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
            "vmov.u8    d31, #1<<7                  \n\t"
            "mov        %[keep_dst], %[dst]         \n\t"

            "tst        %[count], #4                \n\t"
            "beq        14f                         \n\t"
            "vld1.16    {d25}, [%[dst]]!            \n\t"
            "vld1.32    {q1}, [%[src]]!             \n\t"

            "14:                                    \n\t"
            "tst        %[count], #2                \n\t"
            "beq        12f                         \n\t"
            "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
            "vld1.32    {d1}, [%[src]]!             \n\t"

            "12:                                    \n\t"
            "tst        %[count], #1                \n\t"
            "beq        11f                         \n\t"
            "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
            "vld1.32    {d0[1]}, [%[src]]!          \n\t"

            "11:                                    \n\t"
            // unzips achieve the same as a vld4 operation
            "vuzpq.u16  q0, q1                      \n\t"
            "vuzp.u8    d0, d1                      \n\t"
            "vuzp.u8    d2, d3                      \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16  d4, q12                     \n\t"
            "vshr.u16   q11, q12, #5                \n\t"
            "vshr.u16   q10, q12, #6+5              \n\t"
            "vmovn.u16  d5, q11                     \n\t"
            "vmovn.u16  d6, q10                     \n\t"
            "vshl.u8    d4, d4, #3                  \n\t"
            "vshl.u8    d5, d5, #2                  \n\t"
            "vshl.u8    d6, d6, #3                  \n\t"

            "vmovl.u8   q14, d31                    \n\t"
            "vmovl.u8   q13, d31                    \n\t"
            "vmovl.u8   q12, d31                    \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8     d30, d3                     \n\t"
            "vmlal.u8   q14, d30, d6                \n\t"
            "vmlal.u8   q13, d30, d5                \n\t"
            "vmlal.u8   q12, d30, d4                \n\t"
            "vshr.u16   q8, q14, #5                 \n\t"
            "vshr.u16   q9, q13, #6                 \n\t"
            "vaddhn.u16 d6, q14, q8                 \n\t"
            "vshr.u16   q8, q12, #5                 \n\t"
            "vaddhn.u16 d5, q13, q9                 \n\t"
            "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
            "vaddhn.u16 d4, q12, q8                 \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

            "vqadd.u8   d5, d5, d1                  \n\t"
            "vqadd.u8   d4, d4, d2                  \n\t"

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8   q10, d6, #8                 \n\t"
            "vshll.u8   q3, d5, #8                  \n\t"
            "vshll.u8   q2, d4, #8                  \n\t"
            "vsri.u16   q10, q3, #5                 \n\t"
            "vsri.u16   q10, q2, #11                \n\t"

            // store
            "tst        %[count], #4                \n\t"
            "beq        24f                         \n\t"
            "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

            "24:                                    \n\t"
            "tst        %[count], #2                \n\t"
            "beq        22f                         \n\t"
            "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

            "22:                                    \n\t"
            "tst        %[count], #1                \n\t"
            "beq        21f                         \n\t"
            "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

            "21:                                    \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
}
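
/* Reference-only sketch (not used by the blitters here): a scalar view of
 * the "expand 0565 to 8888" and "pack 8888 to 0565" steps in the asm above.
 * The expand step is a plain shift of each 5/6-bit field into the high bits
 * of a byte (the vshl instructions); the pack step keeps only the top
 * 5/6/5 bits of each channel (the vshll/vsri sequence).  Helper names are
 * illustrative, not Skia APIs.
 */
static inline void expand_565_sketch(uint16_t p, unsigned* r, unsigned* g, unsigned* b) {
    *r = ((p >> 11) & 0x1F) << 3;   // red:   5 bits moved up to bits 7..3
    *g = ((p >>  5) & 0x3F) << 2;   // green: 6 bits moved up to bits 7..2
    *b = ( p        & 0x1F) << 3;   // blue:  5 bits moved up to bits 7..3
}

static inline uint16_t pack_565_sketch(unsigned r, unsigned g, unsigned b) {
    // keep the high 5/6/5 bits of the 8-bit channels and interleave into RGB565
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}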

static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src, int count,
                                 U8CPU alpha, int /*x*/, int /*y*/) {

    U8CPU alpha_for_asm = alpha;

    asm volatile (
    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
     * the original in two respects:
     *  1. The results have a few mismatches compared to the original code. These mismatches
     *     never exceed 1. It's possible to improve accuracy vs. a floating point
     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
     *     Rounding is not present in the code below, because although results would be closer
     *     to a floating point implementation, the number of mismatches compared to the
     *     original code would be far greater.
     *  2. On certain inputs, the original code can overflow, causing colour channels to
     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
     *     to affect another.
     */

#if 1
    /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
        "add        %[alpha], %[alpha], #1          \n\t"   // adjust range of alpha 0-256
#else
        "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
#endif
        "vmov.u16   q3, #255                        \n\t"   // set up constant
        "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
        "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
        "beq        2f                              \n\t"   // if count8 == 0, exit
        "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask

        "1:                                         \n\t"
        "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
        "subs       r4, r4, #1                      \n\t"   // decrement loop counter
        "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
                                                            //  and deinterleave

        "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
        "vand       q10, q0, q15                    \n\t"   // extract blue
        "vshr.u16   q8, q0, #11                     \n\t"   // extract red
        "vshr.u16   q9, q9, #10                     \n\t"   // extract green
        // dstrgb = {q8, q9, q10}

        "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
        "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
        "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range

        "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
        "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
        "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
        "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
        // srcrgba = {q11, q12, q13, q14}

        "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
        "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
        "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
        "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale

        "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
        "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
        // dst_scale = q2

        "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
        "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
        "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale

#if 1
        // trying for a better match with SkDiv255Round(a)
        // C alg is:  a+=128; (a+a>>8)>>8
        // we'll use just a rounding shift [q2 is available for scratch]
        "vrshr.u16  q11, q11, #8                    \n\t"   // shift down red
        "vrshr.u16  q12, q12, #8                    \n\t"   // shift down green
        "vrshr.u16  q13, q13, #8                    \n\t"   // shift down blue
#else
        // arm's original "truncating divide by 256"
        "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
        "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
        "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#endif

        "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
        "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
        "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr

        "bne        1b                              \n\t"   // if counter != 0, loop
        "2:                                         \n\t"   // exit

        : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
        :
        : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
          "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
          "d26", "d27", "d28", "d29", "d30", "d31"
    );

    count &= 7;
    if (count > 0) {
        do {
            SkPMColor sc = *src++;
            if (sc) {
                uint16_t dc = *dst;
                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
            }
            dst += 1;
        } while (--count != 0);
    }
}
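
/* Reference-only sketch of the two alpha-range expansions that the "#if 1"
 * blocks above and below choose between.  Both map a 0..255 alpha onto a
 * 0..256 scale so a plain >>8 can stand in for /255; they differ only for
 * alpha values below 128.  These helpers are illustrative and unused.
 */
static inline unsigned alpha_to_scale_current_sketch(unsigned a) {
    return a + 1;           // current SkAlpha255To256() behaviour: 255 -> 256
}
static inline unsigned alpha_to_scale_old_sketch(unsigned a) {
    return a + (a >> 7);    // older behaviour: same for a >= 128, one less for a < 128
}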

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
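
/* Reference-only sketch (not used by the blitters below) of how a row of
 * the matrix is consumed: because every 4-entry row is stored three times,
 * an 8-byte NEON load starting at offset (x & 3) within the row yields the
 * dither values for pixels x, x+1, ..., x+7 with no wrap-around handling.
 */
static inline uint8x8_t load_dither_row_sketch(int x, int y) {
    return vld1_u8(&gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)]);
}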

static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                       int count, U8CPU alpha, int x, int y)
{
    /* select row and offset for dither array */
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

    /* rescale alpha to range 0 - 256 */
    int scale = SkAlpha255To256(alpha);

    asm volatile (
        "vld1.8     {d31}, [%[dstart]]              \n\t"   // load dither values
        "vshr.u8    d30, d31, #1                    \n\t"   // calc. green dither values
        "vdup.16    d6, %[scale]                    \n\t"   // duplicate scale into neon reg
        "vmov.i8    d29, #0x3f                      \n\t"   // set up green mask
        "vmov.i8    d28, #0x1f                      \n\t"   // set up blue mask
        "1:                                         \n\t"
        "vld4.8     {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
        "vshr.u8    d22, d0, #5                     \n\t"   // calc. red >> 5
        "vshr.u8    d23, d1, #6                     \n\t"   // calc. green >> 6
        "vshr.u8    d24, d2, #5                     \n\t"   // calc. blue >> 5
        "vaddl.u8   q8, d0, d31                     \n\t"   // add in dither to red and widen
        "vaddl.u8   q9, d1, d30                     \n\t"   // add in dither to green and widen
        "vaddl.u8   q10, d2, d31                    \n\t"   // add in dither to blue and widen
        "vsubw.u8   q8, q8, d22                     \n\t"   // sub shifted red from result
        "vsubw.u8   q9, q9, d23                     \n\t"   // sub shifted green from result
        "vsubw.u8   q10, q10, d24                   \n\t"   // sub shifted blue from result
        "vshrn.i16  d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
        "vshrn.i16  d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
        "vshrn.i16  d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
        // load 8 pixels from dst, extract rgb
        "vld1.16    {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
        "vshrn.i16  d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
        "vmovn.i16  d18, q0                         \n\t"   // narrow to get blue as bytes
        "vshr.u16   q0, q0, #11                     \n\t"   // shift down to extract red
        "vand       d17, d17, d29                   \n\t"   // and green with green mask
        "vand       d18, d18, d28                   \n\t"   // and blue with blue mask
        "vmovn.i16  d16, q0                         \n\t"   // narrow to get red as bytes
        // src = {d22 (r), d23 (g), d24 (b)}
        // dst = {d16 (r), d17 (g), d18 (b)}
        // subtract dst from src and widen
        "vsubl.s8   q0, d22, d16                    \n\t"   // subtract red dst from src
        "vsubl.s8   q1, d23, d17                    \n\t"   // subtract green dst from src
        "vsubl.s8   q2, d24, d18                    \n\t"   // subtract blue dst from src
        // multiply diffs by scale and shift
        "vmul.i16   q0, q0, d6[0]                   \n\t"   // multiply red by scale
        "vmul.i16   q1, q1, d6[0]                   \n\t"   // multiply green by scale
        "vmul.i16   q2, q2, d6[0]                   \n\t"   // multiply blue by scale
        "subs       %[count], %[count], #8          \n\t"   // decrement loop counter
        "vshrn.i16  d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
        "vshrn.i16  d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
        "vshrn.i16  d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
        // add dst to result
        "vaddl.s8   q0, d0, d16                     \n\t"   // add dst to red
        "vaddl.s8   q1, d2, d17                     \n\t"   // add dst to green
        "vaddl.s8   q2, d4, d18                     \n\t"   // add dst to blue
        // put result into 565 format
        "vsli.i16   q2, q1, #5                      \n\t"   // shift up green and insert into blue
        "vsli.i16   q2, q0, #11                     \n\t"   // shift up red and insert into blue
        "vst1.16    {d4, d5}, [%[dst]]!             \n\t"   // store result
        "bgt        1b                              \n\t"   // loop if count > 0
        : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
        : [dstart] "r" (dstart), [scale] "r" (scale)
        : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
          "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24",
          "d28", "d29", "d30", "d31"
    );

    DITHER_565_SCAN(y);

    while ((count & 7) > 0)
    {
        SkPMColor c = *src++;

        int dither = DITHER_VALUE(x);
        int sr = SkGetPackedR32(c);
        int sg = SkGetPackedG32(c);
        int sb = SkGetPackedB32(c);
        sr = SkDITHER_R32To565(sr, dither);
        sg = SkDITHER_G32To565(sg, dither);
        sb = SkDITHER_B32To565(sb, dither);

        uint16_t d = *dst;
        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
        DITHER_INC_X(x);
        count--;
    }
}

#define S32A_D565_Opaque_PROC       S32A_D565_Opaque_neon
#define S32A_D565_Blend_PROC        S32A_D565_Blend_neon
#define S32_D565_Blend_Dither_PROC  S32_D565_Blend_Dither_neon
#else
#define S32A_D565_Opaque_PROC       NULL
#define S32A_D565_Blend_PROC        NULL
#define S32_D565_Blend_Dither_PROC  NULL
#endif

/* Don't have a special version that assumes each src is opaque, but our S32A
   is still faster than the default, so use it here
*/
#define S32_D565_Opaque_PROC    S32A_D565_Opaque_PROC
#define S32_D565_Blend_PROC     S32A_D565_Blend_PROC

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)

static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                       const SkPMColor* SK_RESTRICT src,
                                       int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {

        uint8x8_t alpha_mask;

        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        /* do the NEON unrolled code */
#define UNROLL  4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

            /* 1st and 2nd bits of the unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                /* get the alphas spread out properly */
                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
#if 1
                /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
                /* we collapsed (255-a)+1 ... */
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
                alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
                alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final = vadd_u8(src_raw, dst_cooked);
            }

#if UNROLL > 2
            /* the 3rd and 4th bits of our unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
#if 1
                /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
                /* we collapsed (255-a)+1 ... */
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
                alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
                alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw_2);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
            }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
#ifdef TEST_SRC_ALPHA
            SkPMColor sc = *src;
            if (sc) {
                unsigned srcA = SkGetPackedA32(sc);
                SkPMColor result = sc;
                if (srcA != 255) {
                    result = SkPMSrcOver(sc, *dst);
                }
                *dst = result;
            }
#else
            *dst = SkPMSrcOver(*src, *dst);
#endif
            src += 1;
            dst += 1;
        }
    }
}

#define S32A_Opaque_BlitRow32_PROC  S32A_Opaque_BlitRow32_neon
#else
#define S32A_Opaque_BlitRow32_PROC  NULL
#endif
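
/* Reference-only scalar sketch of what each byte lane computes in the
 * unrolled NEON loop above: result = src + dst * (256 - srcA) / 256, with
 * the 256-based scale coming from collapsing SkAlpha255To256(255 - a) into
 * (256 - a) as the comments note.  The helper name is illustrative and the
 * function is unused.
 */
static inline uint8_t srcover_lane_sketch(uint8_t s, uint8_t d, uint8_t src_alpha) {
    return (uint8_t)(s + ((d * (256u - src_alpha)) >> 8));
}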

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                     const SkPMColor* SK_RESTRICT src,
                                     int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count > 0) {
        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;

        /* run them N at a time through the NEON unit */
        /* note that each pixel is 4 bytes, each byte treated exactly the same,
         * so we can work under that guise. We *do* know that the src&dst
         * will be 32-bit aligned quantities, so we can specify that on
         * the load/store ops and do a neon 'reinterpret' to get us to
         * byte-sized (pun intended) pieces that we widen/multiply/shift.
         * We're limited to 128 bits in the wide ops, which is 8x16bits
         * or a pair of 32 bit src/dsts.
         */
        /* we *could* manually unroll this loop so that we load 128 bits
         * (as a pair of 64s) from each of src and dst, processing them
         * in pieces. This might give us a little better management of
         * the memory latency, but my initial attempts here did not
         * produce an instruction stream that looked all that nice.
         */
#define UNROLL  2
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint16x8_t src_wide, dst_wide;

            /* get 64 bits of src, widen it, multiply by src_scale */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
            src_wide = vmovl_u8(src_raw);
            /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
            src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));

            /* ditto with dst */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
            dst_wide = vmovl_u8(dst_raw);

            /* combine add with dst multiply into mul-accumulate */
            dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));

            dst_final = vshrn_n_u16(dst_wide, 8);
            vst1_u32(dst, vreinterpret_u32_u8(dst_final));

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
        /* RBE: well, i don't like how gcc manages src/dst across the above
         * loop -- it's constantly calculating src+bias and dst+bias, and it
         * only adjusts the real ones when we leave the loop. Not sure why
         * it's "hoisting down" (hoisting implies above in my lexicon ;))
         * the adjustments to src/dst/count, but it does...
         * (might be SSA-style internal logic...)
         */

#if UNROLL == 2
        if (count == 1) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
        }
#else
        if (count > 0) {
            do {
                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
                src += 1;
                dst += 1;
            } while (--count > 0);
        }
#endif

#undef UNROLL
    }
}

#define S32_Blend_BlitRow32_PROC    S32_Blend_BlitRow32_neon
#else
#define S32_Blend_BlitRow32_PROC    NULL
#endif
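
/* Reference-only per-channel sketch of the blend the NEON loop above
 * performs: src and dst are weighted with 0..256 scales that sum to 256,
 * and the 16-bit product is narrowed back to 8 bits, mirroring the
 * vmul/vmla/vshrn sequence.  Illustrative only; not called anywhere.
 */
static inline uint8_t blend_lane_sketch(uint8_t s, uint8_t d,
                                        unsigned src_scale, unsigned dst_scale) {
    return (uint8_t)((s * src_scale + d * dst_scale) >> 8);
}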

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)

#undef DEBUG_OPAQUE_DITHER

#if defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif

static void S32A_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                         const SkPMColor* SK_RESTRICT src,
                                         int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL  8

    if (count >= UNROLL) {
        uint8x8_t dbase;

#if defined(DEBUG_OPAQUE_DITHER)
        uint16_t tmpbuf[UNROLL];
        int td[UNROLL];
        int tdv[UNROLL];
        int ta[UNROLL];
        int tap[UNROLL];
        uint16_t in_dst[UNROLL];
        int offset = 0;
        int noisy = 0;
#endif

        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);

        do {
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
            /* calculate 8 elements worth into a temp buffer */
            {
                int my_y = y;
                int my_x = x;
                SkPMColor* my_src = (SkPMColor*)src;
                uint16_t* my_dst = dst;
                int i;

                DITHER_565_SCAN(my_y);
                for (i=0;i<UNROLL;i++) {
                    SkPMColor c = *my_src++;
                    SkPMColorAssert(c);
                    if (c) {
                        unsigned a = SkGetPackedA32(c);

                        int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                        tdv[i] = DITHER_VALUE(my_x);
                        ta[i] = a;
                        tap[i] = SkAlpha255To256(a);
                        td[i] = d;

                        unsigned sr = SkGetPackedR32(c);
                        unsigned sg = SkGetPackedG32(c);
                        unsigned sb = SkGetPackedB32(c);
                        sr = SkDITHER_R32_FOR_565(sr, d);
                        sg = SkDITHER_G32_FOR_565(sg, d);
                        sb = SkDITHER_B32_FOR_565(sb, d);

                        uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                        uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                        dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                        // now src and dst expanded are in g:11 r:10 x:1 b:10
                        tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                        td[i] = d;

                    } else {
                        tmpbuf[i] = *my_dst;
                        ta[i] = tdv[i] = td[i] = 0xbeef;
                    }
                    in_dst[i] = *my_dst;
                    my_dst += 1;
                    DITHER_INC_X(my_x);
                }
            }
#endif

            /* source is in ABGR */
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
                    : "r" (src)
                );
                sr = d0; sg = d1; sb = d2; sa = d3;
            }

            /* calculate 'd', which will be 0..7 */
            /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
#if 1
            /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
            alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
#else
            alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
            alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
            d = vshrn_n_u16(alpha8, 8);    /* narrowing too */

            /* sr = sr - (sr>>5) + d */
            /* watching for 8-bit overflow.  d is 0..7; risky range of
             * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
             * safe as long as we do ((sr-sr>>5) + d) */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            /* sb = sb - (sb>>5) + d */
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d,1));

            /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
            dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
            dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */

            /* blend */
#if 1
            /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
            /* originally 255-sa + 1 */
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);
#else
            scale8 = vsubw_u8(vdupq_n_u16(255), sa);
            scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
#endif

#if 1
            /* combine the addq and mul, save 3 insns */
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
#else
            /* known correct, but +3 insns over above */
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmulq_u16(dst_b, scale8);
            dst_g = vmulq_u16(dst_g, scale8);
            dst_r = vmulq_u16(dst_r, scale8);

            /* combine */
            /* NB: vshll widens, need to preserve those bits */
            dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
            dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
            dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
#endif

            /* repack to store */
            dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

            vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
            /* verify my 8 elements match the temp buffer */
            {
                int i, bad=0;
                static int invocation;

                for (i=0;i<UNROLL;i++)
                    if (tmpbuf[i] != dst[i]) bad=1;
                if (bad) {
                    SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                             invocation, offset);
                    SkDebugf("  alpha 0x%x\n", alpha);
                    for (i=0;i<UNROLL;i++)
                        SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                                 i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
                                 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);

                    showme16("alpha8", &alpha8, sizeof(alpha8));
                    showme16("scale8", &scale8, sizeof(scale8));
                    showme8("d", &d, sizeof(d));
                    showme16("dst8", &dst8, sizeof(dst8));
                    showme16("dst_b", &dst_b, sizeof(dst_b));
                    showme16("dst_g", &dst_g, sizeof(dst_g));
                    showme16("dst_r", &dst_r, sizeof(dst_r));
                    showme8("sb", &sb, sizeof(sb));
                    showme8("sg", &sg, sizeof(sg));
                    showme8("sr", &sr, sizeof(sr));

                    /* cop out */
                    return;
                }
                offset += UNROLL;
                invocation++;
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            /* skip x += UNROLL, since it's unchanged mod-4 */
        } while (count >= UNROLL);
    }
#undef UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

#define S32A_D565_Opaque_Dither_PROC    S32A_D565_Opaque_Dither_neon
#else
#define S32A_D565_Opaque_Dither_PROC    NULL
#endif
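
/* Reference-only sketch of the per-channel dither step used by the opaque
 * dithering blitters above and below: subtract the channel's own top bits
 * (v >> 5, or v >> 6 for green) and add the dither value d (0..7), then drop
 * the now-redundant low bits.  Helper names are illustrative and unused.
 */
static inline unsigned dither_8_to_5_sketch(unsigned v, unsigned d) {
    return (v - (v >> 5) + d) >> 3;         // 8-bit red/blue channel -> 5 bits
}
static inline unsigned dither_8_to_6_sketch(unsigned v, unsigned d) {
    return (v - (v >> 6) + (d >> 1)) >> 2;  // 8-bit green channel -> 6 bits
}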

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
 * speedup untested, but ARM version is 26 insns/iteration and
 * this NEON version is 21 insns/iteration-of-8 (2.62 insns/element),
 * which is roughly 10x fewer instructions per element than the ARM
 * version; that's pure instruction counts, not accounting for any
 * instruction or memory latencies.
 */

#undef DEBUG_S32_OPAQUE_DITHER

static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                        const SkPMColor* SK_RESTRICT src,
                                        int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL  8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb, sa;
            uint16x8_t dr, dg, db, da;
            uint16x8_t dst8;

            /* source is in ABGR ordering (R == lsb) */
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
                    : "r" (src)
                );
                sr = d0; sg = d1; sb = d2; sa = d3;
            }
            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            /* sr = sr - (sr>>5) + d */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            /* sb = sb - (sb>>5) + d */
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d,1));
            /* XXX: check that the "d>>1" here is hoisted */

            /* pack high bits of each into 565 format (rgb, b is lsb) */
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);

            /* store it */
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            /* always good to know if we generated good results */
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i=0;i<UNROLL;i++) {
                    SkPMColor c = src[i];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            x += UNROLL;    /* probably superfluous */
        }
    }
#undef UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

#define S32_D565_Opaque_Dither_PROC     S32_D565_Opaque_Dither_neon
#else
#define S32_D565_Opaque_Dither_PROC     NULL
#endif

///////////////////////////////////////////////////////////////////////////////

static const SkBlitRow::Proc platform_565_procs[] = {
    // no dither
    S32_D565_Opaque_PROC,
    S32_D565_Blend_PROC,
    S32A_D565_Opaque_PROC,
    S32A_D565_Blend_PROC,

    // dither
    S32_D565_Opaque_Dither_PROC,
    S32_D565_Blend_Dither_PROC,
    S32A_D565_Opaque_Dither_PROC,
    NULL,   // S32A_D565_Blend_Dither
};

static const SkBlitRow::Proc platform_4444_procs[] = {
    // no dither
    NULL,   // S32_D4444_Opaque,
    NULL,   // S32_D4444_Blend,
    NULL,   // S32A_D4444_Opaque,
    NULL,   // S32A_D4444_Blend,

    // dither
    NULL,   // S32_D4444_Opaque_Dither,
    NULL,   // S32_D4444_Blend_Dither,
    NULL,   // S32A_D4444_Opaque_Dither,
    NULL,   // S32A_D4444_Blend_Dither
};

static const SkBlitRow::Proc32 platform_32_procs[] = {
    NULL,                           // S32_Opaque,
    S32_Blend_BlitRow32_PROC,       // S32_Blend,
    S32A_Opaque_BlitRow32_PROC,     // S32A_Opaque,
    NULL,                           // S32A_Blend,
};

SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
    return platform_4444_procs[flags];
}

SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
    return platform_565_procs[flags];
}

SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
    return platform_32_procs[flags];
}