1 /* 2 * Copyright 2011 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include "Sk4px.h" 9 #include "SkBlitRow.h" 10 #include "SkColorData.h" 11 #include "SkOpts.h" 12 #include "SkUtils.h" 13 14 // Everyone agrees memcpy() is the best way to do this. 15 static void blit_row_s32_opaque(SkPMColor* dst, 16 const SkPMColor* src, 17 int count, 18 U8CPU alpha) { 19 SkASSERT(255 == alpha); 20 memcpy(dst, src, count * sizeof(SkPMColor)); 21 } 22 23 // We have SSE2, NEON, and portable implementations of 24 // blit_row_s32_blend() and blit_row_s32a_blend(). 25 26 // TODO(mtklein): can we do better in NEON than 2 pixels at a time? 27 28 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 29 #include <emmintrin.h> 30 31 static inline __m128i SkPMLerp_SSE2(const __m128i& src, 32 const __m128i& dst, 33 const unsigned src_scale) { 34 // Computes dst + (((src - dst)*src_scale)>>8) 35 const __m128i mask = _mm_set1_epi32(0x00FF00FF); 36 37 // Unpack the 16x8-bit source into 2 8x16-bit splayed halves. 38 __m128i src_rb = _mm_and_si128(mask, src); 39 __m128i src_ag = _mm_srli_epi16(src, 8); 40 __m128i dst_rb = _mm_and_si128(mask, dst); 41 __m128i dst_ag = _mm_srli_epi16(dst, 8); 42 43 // Compute scaled differences. 44 __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb); 45 __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag); 46 __m128i s = _mm_set1_epi16(src_scale); 47 diff_rb = _mm_mullo_epi16(diff_rb, s); 48 diff_ag = _mm_mullo_epi16(diff_ag, s); 49 50 // Pack the differences back together. 51 diff_rb = _mm_srli_epi16(diff_rb, 8); 52 diff_ag = _mm_andnot_si128(mask, diff_ag); 53 __m128i diff = _mm_or_si128(diff_rb, diff_ag); 54 55 // Add difference to destination. 56 return _mm_add_epi8(dst, diff); 57 } 58 59 60 static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { 61 SkASSERT(alpha <= 255); 62 63 auto src4 = (const __m128i*)src; 64 auto dst4 = ( __m128i*)dst; 65 66 while (count >= 4) { 67 _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4), 68 _mm_loadu_si128(dst4), 69 SkAlpha255To256(alpha))); 70 src4++; 71 dst4++; 72 count -= 4; 73 } 74 75 src = (const SkPMColor*)src4; 76 dst = ( SkPMColor*)dst4; 77 78 while (count --> 0) { 79 *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha)); 80 src++; 81 dst++; 82 } 83 } 84 85 static inline __m128i SkBlendARGB32_SSE2(const __m128i& src, 86 const __m128i& dst, 87 const unsigned aa) { 88 unsigned alpha = SkAlpha255To256(aa); 89 __m128i src_scale = _mm_set1_epi16(alpha); 90 // SkAlphaMulInv256(SkGetPackedA32(src), src_scale) 91 __m128i dst_scale = _mm_srli_epi32(src, 24); 92 // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale. 93 dst_scale = _mm_mullo_epi16(dst_scale, src_scale); 94 dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale); 95 dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8)); 96 dst_scale = _mm_srli_epi32(dst_scale, 8); 97 // Duplicate scales into 2x16-bit pattern per pixel. 98 dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0)); 99 dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0)); 100 101 const __m128i mask = _mm_set1_epi32(0x00FF00FF); 102 103 // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves. 104 __m128i src_rb = _mm_and_si128(mask, src); 105 __m128i src_ag = _mm_srli_epi16(src, 8); 106 __m128i dst_rb = _mm_and_si128(mask, dst); 107 __m128i dst_ag = _mm_srli_epi16(dst, 8); 108 109 // Scale them. 110 src_rb = _mm_mullo_epi16(src_rb, src_scale); 111 src_ag = _mm_mullo_epi16(src_ag, src_scale); 112 dst_rb = _mm_mullo_epi16(dst_rb, dst_scale); 113 dst_ag = _mm_mullo_epi16(dst_ag, dst_scale); 114 115 // Add the scaled source and destination. 116 dst_rb = _mm_add_epi16(src_rb, dst_rb); 117 dst_ag = _mm_add_epi16(src_ag, dst_ag); 118 119 // Unsplay the halves back together. 120 dst_rb = _mm_srli_epi16(dst_rb, 8); 121 dst_ag = _mm_andnot_si128(mask, dst_ag); 122 return _mm_or_si128(dst_rb, dst_ag); 123 } 124 125 static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { 126 SkASSERT(alpha <= 255); 127 128 auto src4 = (const __m128i*)src; 129 auto dst4 = ( __m128i*)dst; 130 131 while (count >= 4) { 132 _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4), 133 _mm_loadu_si128(dst4), 134 alpha)); 135 src4++; 136 dst4++; 137 count -= 4; 138 } 139 140 src = (const SkPMColor*)src4; 141 dst = ( SkPMColor*)dst4; 142 143 while (count --> 0) { 144 *dst = SkBlendARGB32(*src, *dst, alpha); 145 src++; 146 dst++; 147 } 148 } 149 150 #elif defined(SK_ARM_HAS_NEON) 151 #include <arm_neon.h> 152 153 static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { 154 SkASSERT(alpha <= 255); 155 156 uint16_t src_scale = SkAlpha255To256(alpha); 157 uint16_t dst_scale = 256 - src_scale; 158 159 while (count >= 2) { 160 uint8x8_t vsrc, vdst, vres; 161 uint16x8_t vsrc_wide, vdst_wide; 162 163 vsrc = vreinterpret_u8_u32(vld1_u32(src)); 164 vdst = vreinterpret_u8_u32(vld1_u32(dst)); 165 166 vsrc_wide = vmovl_u8(vsrc); 167 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); 168 169 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); 170 171 vdst_wide += vsrc_wide; 172 vres = vshrn_n_u16(vdst_wide, 8); 173 174 vst1_u32(dst, vreinterpret_u32_u8(vres)); 175 176 src += 2; 177 dst += 2; 178 count -= 2; 179 } 180 181 if (count == 1) { 182 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; 183 uint16x8_t vsrc_wide, vdst_wide; 184 185 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); 186 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); 187 188 vsrc_wide = vmovl_u8(vsrc); 189 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); 190 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); 191 vdst_wide += vsrc_wide; 192 vres = vshrn_n_u16(vdst_wide, 8); 193 194 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); 195 } 196 } 197 198 static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { 199 SkASSERT(alpha < 255); 200 201 unsigned alpha256 = SkAlpha255To256(alpha); 202 203 if (count & 1) { 204 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; 205 uint16x8_t vdst_wide, vsrc_wide; 206 unsigned dst_scale; 207 208 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); 209 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); 210 211 dst_scale = vget_lane_u8(vsrc, 3); 212 dst_scale = SkAlphaMulInv256(dst_scale, alpha256); 213 214 vsrc_wide = vmovl_u8(vsrc); 215 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); 216 217 vdst_wide = vmovl_u8(vdst); 218 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); 219 220 vdst_wide += vsrc_wide; 221 vres = vshrn_n_u16(vdst_wide, 8); 222 223 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); 224 dst++; 225 src++; 226 count--; 227 } 228 229 uint8x8_t alpha_mask; 230 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 231 alpha_mask = vld1_u8(alpha_mask_setup); 232 233 while (count) { 234 235 uint8x8_t vsrc, vdst, vres, vsrc_alphas; 236 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale; 237 238 __builtin_prefetch(src+32); 239 __builtin_prefetch(dst+32); 240 241 vsrc = vreinterpret_u8_u32(vld1_u32(src)); 242 vdst = vreinterpret_u8_u32(vld1_u32(dst)); 243 244 vsrc_scale = vdupq_n_u16(alpha256); 245 246 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); 247 vdst_scale = vmovl_u8(vsrc_alphas); 248 // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale). 249 // A 16-bit lane would overflow if we used 0xFFFF here, 250 // so use an approximation with 0xFF00 that is off by 1, 251 // and add back 1 after to get the correct value. 252 // This is valid if alpha256 <= 255. 253 vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale); 254 vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8); 255 vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8); 256 257 vsrc_wide = vmovl_u8(vsrc); 258 vsrc_wide *= vsrc_scale; 259 260 vdst_wide = vmovl_u8(vdst); 261 vdst_wide *= vdst_scale; 262 263 vdst_wide += vsrc_wide; 264 vres = vshrn_n_u16(vdst_wide, 8); 265 266 vst1_u32(dst, vreinterpret_u32_u8(vres)); 267 268 src += 2; 269 dst += 2; 270 count -= 2; 271 } 272 } 273 274 #else 275 static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { 276 SkASSERT(alpha <= 255); 277 while (count --> 0) { 278 *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha)); 279 src++; 280 dst++; 281 } 282 } 283 284 static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) { 285 SkASSERT(alpha <= 255); 286 while (count --> 0) { 287 *dst = SkBlendARGB32(*src, *dst, alpha); 288 src++; 289 dst++; 290 } 291 } 292 #endif 293 294 SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) { 295 static const SkBlitRow::Proc32 kProcs[] = { 296 blit_row_s32_opaque, 297 blit_row_s32_blend, 298 nullptr, // blit_row_s32a_opaque is in SkOpts 299 blit_row_s32a_blend 300 }; 301 302 SkASSERT(flags < SK_ARRAY_COUNT(kProcs)); 303 flags &= SK_ARRAY_COUNT(kProcs) - 1; // just to be safe 304 305 return flags == 2 ? SkOpts::blit_row_s32a_opaque 306 : kProcs[flags]; 307 } 308 309 void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) { 310 switch (SkGetPackedA32(color)) { 311 case 0: memmove(dst, src, count * sizeof(SkPMColor)); return; 312 case 255: sk_memset32(dst, color, count); return; 313 } 314 315 unsigned invA = 255 - SkGetPackedA32(color); 316 invA += invA >> 7; 317 SkASSERT(invA < 256); // We've should have already handled alpha == 0 externally. 318 319 Sk16h colorHighAndRound = (Sk4px::DupPMColor(color).widen() << 8) + Sk16h(128); 320 Sk16b invA_16x(invA); 321 322 Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px { 323 return (src4 * invA_16x).addNarrowHi(colorHighAndRound); 324 }); 325 } 326