1 /* 2 * Copyright 2015 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include "SkBlitMask.h" 9 #include "SkColor_opts_neon.h" 10 11 void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[], 12 SkColor color, int width, 13 SkPMColor opaqueDst) { 14 int colR = SkColorGetR(color); 15 int colG = SkColorGetG(color); 16 int colB = SkColorGetB(color); 17 18 uint8x8_t vcolR, vcolG, vcolB; 19 uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB; 20 21 if (width >= 8) { 22 vcolR = vdup_n_u8(colR); 23 vcolG = vdup_n_u8(colG); 24 vcolB = vdup_n_u8(colB); 25 vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst)); 26 vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst)); 27 vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst)); 28 vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst)); 29 } 30 31 while (width >= 8) { 32 uint8x8x4_t vdst; 33 uint16x8_t vmask; 34 uint16x8_t vmaskR, vmaskG, vmaskB; 35 uint8x8_t vsel_trans, vsel_opq; 36 37 vdst = vld4_u8((uint8_t*)dst); 38 vmask = vld1q_u16(src); 39 40 // Prepare compare masks 41 vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0))); 42 vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF))); 43 44 // Get all the color masks on 5 bits 45 vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); 46 vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), 47 SK_B16_BITS + SK_R16_BITS + 1); 48 vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); 49 50 // Upscale to 0..32 51 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); 52 vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); 53 vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); 54 55 vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF)); 56 vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]); 57 58 vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); 59 vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); 60 vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); 61 62 vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]); 63 vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]); 64 vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]); 65 66 vst4_u8((uint8_t*)dst, vdst); 67 68 dst += 8; 69 src += 8; 70 width -= 8; 71 } 72 73 // Leftovers 74 for (int i = 0; i < width; i++) { 75 dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i], 76 opaqueDst); 77 } 78 } 79 80 void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[], 81 SkColor color, int width, SkPMColor) { 82 int colA = SkColorGetA(color); 83 int colR = SkColorGetR(color); 84 int colG = SkColorGetG(color); 85 int colB = SkColorGetB(color); 86 87 colA = SkAlpha255To256(colA); 88 89 uint8x8_t vcolR, vcolG, vcolB; 90 uint16x8_t vcolA; 91 92 if (width >= 8) { 93 vcolA = vdupq_n_u16(colA); 94 vcolR = vdup_n_u8(colR); 95 vcolG = vdup_n_u8(colG); 96 vcolB = vdup_n_u8(colB); 97 } 98 99 while (width >= 8) { 100 uint8x8x4_t vdst; 101 uint16x8_t vmask; 102 uint16x8_t vmaskR, vmaskG, vmaskB; 103 104 vdst = vld4_u8((uint8_t*)dst); 105 vmask = vld1q_u16(src); 106 107 // Get all the color masks on 5 bits 108 vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); 109 vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), 110 SK_B16_BITS + SK_R16_BITS + 1); 111 vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); 112 113 // Upscale to 0..32 114 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); 115 vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); 116 vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); 117 118 vmaskR = vshrq_n_u16(vmaskR * vcolA, 8); 119 vmaskG = vshrq_n_u16(vmaskG * vcolA, 8); 120 vmaskB = vshrq_n_u16(vmaskB * vcolA, 8); 121 122 vdst.val[NEON_A] = vdup_n_u8(0xFF); 123 vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); 124 vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); 125 vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); 126 127 vst4_u8((uint8_t*)dst, vdst); 128 129 dst += 8; 130 src += 8; 131 width -= 8; 132 } 133 134 for (int i = 0; i < width; i++) { 135 dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]); 136 } 137 } 138 139 #define LOAD_LANE_16(reg, n) \ 140 reg = vld1q_lane_u16(device, reg, n); \ 141 device = (uint16_t*)((char*)device + deviceRB); 142 143 #define STORE_LANE_16(reg, n) \ 144 vst1_lane_u16(dst, reg, n); \ 145 dst = (uint16_t*)((char*)dst + deviceRB); 146 147 void SkRGB16BlitterBlitV_neon(uint16_t* device, 148 int height, 149 size_t deviceRB, 150 unsigned scale, 151 uint32_t src32) { 152 if (height >= 8) 153 { 154 uint16_t* dst = device; 155 156 // prepare constants 157 uint16x8_t vdev = vdupq_n_u16(0); 158 uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE); 159 uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE); 160 uint32x4_t vsrc32 = vdupq_n_u32(src32); 161 uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale); 162 163 while (height >= 8){ 164 LOAD_LANE_16(vdev, 0) 165 LOAD_LANE_16(vdev, 1) 166 LOAD_LANE_16(vdev, 2) 167 LOAD_LANE_16(vdev, 3) 168 LOAD_LANE_16(vdev, 4) 169 LOAD_LANE_16(vdev, 5) 170 LOAD_LANE_16(vdev, 6) 171 LOAD_LANE_16(vdev, 7) 172 173 // Expand_rgb_16 174 uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16)); 175 uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5); 176 uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5); 177 178 // Compact_rgb_16 179 vdst32_lo = vaddq_u32(vdst32_lo, vsrc32); 180 vdst32_hi = vaddq_u32(vdst32_hi, vsrc32); 181 vdst32_lo = vshrq_n_u32(vdst32_lo, 5); 182 vdst32_hi = vshrq_n_u32(vdst32_hi, 5); 183 184 uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16); 185 uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16); 186 uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi); 187 vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16); 188 vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16); 189 uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi); 190 191 STORE_LANE_16(vdst16_lo, 0) 192 STORE_LANE_16(vdst16_lo, 1) 193 STORE_LANE_16(vdst16_lo, 2) 194 STORE_LANE_16(vdst16_lo, 3) 195 STORE_LANE_16(vdst16_hi, 0) 196 STORE_LANE_16(vdst16_hi, 1) 197 STORE_LANE_16(vdst16_hi, 2) 198 STORE_LANE_16(vdst16_hi, 3) 199 height -= 8; 200 } 201 } 202 while (height != 0){ 203 uint32_t dst32 = SkExpand_rgb_16(*device) * scale; 204 *device = SkCompact_rgb_16((src32 + dst32) >> 5); 205 device = (uint16_t*)((char*)device + deviceRB); 206 height--; 207 } 208 } 209 210 #undef LOAD_LANE_16 211 #undef STORE_LANE_16 212