Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2015 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include "SkBlitMask.h"
      9 #include "SkColor_opts_neon.h"
     10 
     11 void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
     12                                         SkColor color, int width,
     13                                         SkPMColor opaqueDst) {
     14     int colR = SkColorGetR(color);
     15     int colG = SkColorGetG(color);
     16     int colB = SkColorGetB(color);
     17 
     18     uint8x8_t vcolR = vdup_n_u8(colR);
     19     uint8x8_t vcolG = vdup_n_u8(colG);
     20     uint8x8_t vcolB = vdup_n_u8(colB);
     21     uint8x8_t vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
     22     uint8x8_t vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
     23     uint8x8_t vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
     24     uint8x8_t vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
     25 
     26     while (width >= 8) {
     27         uint8x8x4_t vdst;
     28         uint16x8_t vmask;
     29         uint16x8_t vmaskR, vmaskG, vmaskB;
     30         uint8x8_t vsel_trans, vsel_opq;
     31 
     32         vdst = vld4_u8((uint8_t*)dst);
     33         vmask = vld1q_u16(src);
     34 
     35         // Prepare compare masks
     36         vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
     37         vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));
     38 
     39         // Get all the color masks on 5 bits
     40         vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
     41         vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
     42                              SK_B16_BITS + SK_R16_BITS + 1);
     43         vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
     44 
     45         // Upscale to 0..32
     46         vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
     47         vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
     48         vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
     49 
     50         vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
     51         vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);
     52 
     53         vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
     54         vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
     55         vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
     56 
     57         vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
     58         vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
     59         vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);
     60 
     61         vst4_u8((uint8_t*)dst, vdst);
     62 
     63         dst += 8;
     64         src += 8;
     65         width -= 8;
     66     }
     67 
     68     // Leftovers
     69     for (int i = 0; i < width; i++) {
     70         dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
     71                                     opaqueDst);
     72     }
     73 }
     74 
     75 void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
     76                                    SkColor color, int width, SkPMColor) {
     77     int colA = SkColorGetA(color);
     78     int colR = SkColorGetR(color);
     79     int colG = SkColorGetG(color);
     80     int colB = SkColorGetB(color);
     81 
     82     colA = SkAlpha255To256(colA);
     83 
     84     uint16x8_t vcolA = vdupq_n_u16(colA);
     85     uint8x8_t vcolR = vdup_n_u8(colR);
     86     uint8x8_t vcolG = vdup_n_u8(colG);
     87     uint8x8_t vcolB = vdup_n_u8(colB);
     88 
     89     while (width >= 8) {
     90         uint8x8x4_t vdst;
     91         uint16x8_t vmask;
     92         uint16x8_t vmaskR, vmaskG, vmaskB;
     93 
     94         vdst = vld4_u8((uint8_t*)dst);
     95         vmask = vld1q_u16(src);
     96 
     97         // Get all the color masks on 5 bits
     98         vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
     99         vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
    100                              SK_B16_BITS + SK_R16_BITS + 1);
    101         vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
    102 
    103         // Upscale to 0..32
    104         vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
    105         vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
    106         vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
    107 
    108         vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
    109         vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
    110         vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
    111 
    112         vdst.val[NEON_A] = vdup_n_u8(0xFF);
    113         vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
    114         vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
    115         vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
    116 
    117         vst4_u8((uint8_t*)dst, vdst);
    118 
    119         dst += 8;
    120         src += 8;
    121         width -= 8;
    122     }
    123 
    124     for (int i = 0; i < width; i++) {
    125         dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
    126     }
    127 }
    128 
    129 #define LOAD_LANE_16(reg, n) \
    130     reg = vld1q_lane_u16(device, reg, n); \
    131     device = (uint16_t*)((char*)device + deviceRB);
    132 
    133 #define STORE_LANE_16(reg, n) \
    134     vst1_lane_u16(dst, reg, n); \
    135     dst = (uint16_t*)((char*)dst + deviceRB);
    136 
    137 void SkRGB16BlitterBlitV_neon(uint16_t* device,
    138                               int height,
    139                               size_t deviceRB,
    140                               unsigned scale,
    141                               uint32_t src32) {
    142     if (height >= 8)
    143     {
    144         uint16_t* dst = device;
    145 
    146         // prepare constants
    147         uint16x8_t vdev = vdupq_n_u16(0);
    148         uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
    149         uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);
    150         uint32x4_t vsrc32 = vdupq_n_u32(src32);
    151         uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);
    152 
    153         while (height >= 8){
    154             LOAD_LANE_16(vdev, 0)
    155             LOAD_LANE_16(vdev, 1)
    156             LOAD_LANE_16(vdev, 2)
    157             LOAD_LANE_16(vdev, 3)
    158             LOAD_LANE_16(vdev, 4)
    159             LOAD_LANE_16(vdev, 5)
    160             LOAD_LANE_16(vdev, 6)
    161             LOAD_LANE_16(vdev, 7)
    162 
    163             // Expand_rgb_16
    164             uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));
    165             uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);
    166             uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);
    167 
    168             // Compact_rgb_16
    169             vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);
    170             vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);
    171             vdst32_lo = vshrq_n_u32(vdst32_lo, 5);
    172             vdst32_hi = vshrq_n_u32(vdst32_hi, 5);
    173 
    174             uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);
    175             uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);
    176             uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);
    177             vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);
    178             vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);
    179             uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);
    180 
    181             STORE_LANE_16(vdst16_lo, 0)
    182             STORE_LANE_16(vdst16_lo, 1)
    183             STORE_LANE_16(vdst16_lo, 2)
    184             STORE_LANE_16(vdst16_lo, 3)
    185             STORE_LANE_16(vdst16_hi, 0)
    186             STORE_LANE_16(vdst16_hi, 1)
    187             STORE_LANE_16(vdst16_hi, 2)
    188             STORE_LANE_16(vdst16_hi, 3)
    189             height -= 8;
    190         }
    191     }
    192     while (height != 0){
    193         uint32_t dst32 = SkExpand_rgb_16(*device) * scale;
    194         *device = SkCompact_rgb_16((src32 + dst32) >> 5);
    195         device = (uint16_t*)((char*)device + deviceRB);
    196         height--;
    197     }
    198 }
    199 
    200 #undef LOAD_LANE_16
    201 #undef STORE_LANE_16
    202