Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2015 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include "SkBlitMask.h"
      9 #include "SkColor_opts_neon.h"
     10 
     11 void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
     12                                         SkColor color, int width,
     13                                         SkPMColor opaqueDst) {
     14     int colR = SkColorGetR(color);
     15     int colG = SkColorGetG(color);
     16     int colB = SkColorGetB(color);
     17 
     18     uint8x8_t vcolR, vcolG, vcolB;
     19     uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;
     20 
     21     if (width >= 8) {
     22         vcolR = vdup_n_u8(colR);
     23         vcolG = vdup_n_u8(colG);
     24         vcolB = vdup_n_u8(colB);
     25         vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
     26         vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
     27         vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
     28         vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
     29     }
     30 
     31     while (width >= 8) {
     32         uint8x8x4_t vdst;
     33         uint16x8_t vmask;
     34         uint16x8_t vmaskR, vmaskG, vmaskB;
     35         uint8x8_t vsel_trans, vsel_opq;
     36 
     37         vdst = vld4_u8((uint8_t*)dst);
     38         vmask = vld1q_u16(src);
     39 
     40         // Prepare compare masks
     41         vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
     42         vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));
     43 
     44         // Get all the color masks on 5 bits
     45         vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
     46         vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
     47                              SK_B16_BITS + SK_R16_BITS + 1);
     48         vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
     49 
     50         // Upscale to 0..32
     51         vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
     52         vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
     53         vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
     54 
     55         vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
     56         vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);
     57 
     58         vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
     59         vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
     60         vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
     61 
     62         vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
     63         vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
     64         vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);
     65 
     66         vst4_u8((uint8_t*)dst, vdst);
     67 
     68         dst += 8;
     69         src += 8;
     70         width -= 8;
     71     }
     72 
     73     // Leftovers
     74     for (int i = 0; i < width; i++) {
     75         dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
     76                                     opaqueDst);
     77     }
     78 }
     79 
     80 void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
     81                                    SkColor color, int width, SkPMColor) {
     82     int colA = SkColorGetA(color);
     83     int colR = SkColorGetR(color);
     84     int colG = SkColorGetG(color);
     85     int colB = SkColorGetB(color);
     86 
     87     colA = SkAlpha255To256(colA);
     88 
     89     uint8x8_t vcolR, vcolG, vcolB;
     90     uint16x8_t vcolA;
     91 
     92     if (width >= 8) {
     93         vcolA = vdupq_n_u16(colA);
     94         vcolR = vdup_n_u8(colR);
     95         vcolG = vdup_n_u8(colG);
     96         vcolB = vdup_n_u8(colB);
     97     }
     98 
     99     while (width >= 8) {
    100         uint8x8x4_t vdst;
    101         uint16x8_t vmask;
    102         uint16x8_t vmaskR, vmaskG, vmaskB;
    103 
    104         vdst = vld4_u8((uint8_t*)dst);
    105         vmask = vld1q_u16(src);
    106 
    107         // Get all the color masks on 5 bits
    108         vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
    109         vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
    110                              SK_B16_BITS + SK_R16_BITS + 1);
    111         vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
    112 
    113         // Upscale to 0..32
    114         vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
    115         vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
    116         vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
    117 
    118         vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
    119         vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
    120         vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
    121 
    122         vdst.val[NEON_A] = vdup_n_u8(0xFF);
    123         vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
    124         vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
    125         vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
    126 
    127         vst4_u8((uint8_t*)dst, vdst);
    128 
    129         dst += 8;
    130         src += 8;
    131         width -= 8;
    132     }
    133 
    134     for (int i = 0; i < width; i++) {
    135         dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
    136     }
    137 }
    138 
    139 #define LOAD_LANE_16(reg, n) \
    140     reg = vld1q_lane_u16(device, reg, n); \
    141     device = (uint16_t*)((char*)device + deviceRB);
    142 
    143 #define STORE_LANE_16(reg, n) \
    144     vst1_lane_u16(dst, reg, n); \
    145     dst = (uint16_t*)((char*)dst + deviceRB);
    146 
    147 void SkRGB16BlitterBlitV_neon(uint16_t* device,
    148                               int height,
    149                               size_t deviceRB,
    150                               unsigned scale,
    151                               uint32_t src32) {
    152     if (height >= 8)
    153     {
    154         uint16_t* dst = device;
    155 
    156         // prepare constants
    157         uint16x8_t vdev = vdupq_n_u16(0);
    158         uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
    159         uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);
    160         uint32x4_t vsrc32 = vdupq_n_u32(src32);
    161         uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);
    162 
    163         while (height >= 8){
    164             LOAD_LANE_16(vdev, 0)
    165             LOAD_LANE_16(vdev, 1)
    166             LOAD_LANE_16(vdev, 2)
    167             LOAD_LANE_16(vdev, 3)
    168             LOAD_LANE_16(vdev, 4)
    169             LOAD_LANE_16(vdev, 5)
    170             LOAD_LANE_16(vdev, 6)
    171             LOAD_LANE_16(vdev, 7)
    172 
    173             // Expand_rgb_16
    174             uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));
    175             uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);
    176             uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);
    177 
    178             // Compact_rgb_16
    179             vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);
    180             vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);
    181             vdst32_lo = vshrq_n_u32(vdst32_lo, 5);
    182             vdst32_hi = vshrq_n_u32(vdst32_hi, 5);
    183 
    184             uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);
    185             uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);
    186             uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);
    187             vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);
    188             vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);
    189             uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);
    190 
    191             STORE_LANE_16(vdst16_lo, 0)
    192             STORE_LANE_16(vdst16_lo, 1)
    193             STORE_LANE_16(vdst16_lo, 2)
    194             STORE_LANE_16(vdst16_lo, 3)
    195             STORE_LANE_16(vdst16_hi, 0)
    196             STORE_LANE_16(vdst16_hi, 1)
    197             STORE_LANE_16(vdst16_hi, 2)
    198             STORE_LANE_16(vdst16_hi, 3)
    199             height -= 8;
    200         }
    201     }
    202     while (height != 0){
    203         uint32_t dst32 = SkExpand_rgb_16(*device) * scale;
    204         *device = SkCompact_rgb_16((src32 + dst32) >> 5);
    205         device = (uint16_t*)((char*)device + deviceRB);
    206         height--;
    207     }
    208 }
    209 
    210 #undef LOAD_LANE_16
    211 #undef STORE_LANE_16
    212