Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2014 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include <arm_neon.h>
      9 
     10 #define SCALE_NOFILTER_NAME     MAKENAME(_nofilter_scale)
     11 #define SCALE_FILTER_NAME       MAKENAME(_filter_scale)
     12 #define AFFINE_NOFILTER_NAME    MAKENAME(_nofilter_affine)
     13 #define AFFINE_FILTER_NAME      MAKENAME(_filter_affine)
     14 #define PERSP_NOFILTER_NAME     MAKENAME(_nofilter_persp)
     15 #define PERSP_FILTER_NAME       MAKENAME(_filter_persp)
     16 
     17 #define PACK_FILTER_X_NAME  MAKENAME(_pack_filter_x)
     18 #define PACK_FILTER_Y_NAME  MAKENAME(_pack_filter_y)
     19 #define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4)
     20 #define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4)
     21 
     22 #ifndef PREAMBLE
     23     #define PREAMBLE(state)
     24     #define PREAMBLE_PARAM_X
     25     #define PREAMBLE_PARAM_Y
     26     #define PREAMBLE_ARG_X
     27     #define PREAMBLE_ARG_Y
     28 #endif
     29 
     30 static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
     31                                 uint32_t xy[], int count, int x, int y) {
     32     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
     33                              SkMatrix::kScale_Mask)) == 0);
     34 
     35     PREAMBLE(s);
     36 
     37     // we store y, x, x, x, x, x
     38     const unsigned maxX = s.fPixmap.width() - 1;
     39     SkFractionalInt fx;
     40     {
     41         const SkBitmapProcStateAutoMapper mapper(s, x, y);
     42         const unsigned maxY = s.fPixmap.height() - 1;
     43         *xy++ = TILEY_PROCF(mapper.fixedY(), maxY);
     44         fx = mapper.fractionalIntX();
     45     }
     46 
     47     if (0 == maxX) {
     48         // all of the following X values must be 0
     49         memset(xy, 0, count * sizeof(uint16_t));
     50         return;
     51     }
     52 
     53     const SkFractionalInt dx = s.fInvSxFractionalInt;
     54 
     55 #ifdef CHECK_FOR_DECAL
     56     // test if we don't need to apply the tile proc
     57     if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
     58         decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx),
     59                              SkFractionalIntToFixed(dx), count);
     60         return;
     61     }
     62 #endif
     63 
     64     if (count >= 8) {
     65         SkFractionalInt dx2 = dx+dx;
     66         SkFractionalInt dx4 = dx2+dx2;
     67         SkFractionalInt dx8 = dx4+dx4;
     68 
     69         // now build fx/fx+dx/fx+2dx/fx+3dx
     70         SkFractionalInt fx1, fx2, fx3;
     71         int32x4_t lbase, hbase;
     72         int16_t *dst16 = (int16_t *)xy;
     73 
     74         fx1 = fx+dx;
     75         fx2 = fx1+dx;
     76         fx3 = fx2+dx;
     77 
     78         lbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
     79         lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1);
     80         lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2);
     81         lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3);
     82         hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
     83 
     84         // store & bump
     85         while (count >= 8) {
     86 
     87             int16x8_t fx8;
     88 
     89             fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX);
     90 
     91             vst1q_s16(dst16, fx8);
     92 
     93             // but preserving base & on to the next
     94             lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
     95             hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
     96             dst16 += 8;
     97             count -= 8;
     98             fx += dx8;
     99         };
    100         xy = (uint32_t *) dst16;
    101     }
    102 
    103     uint16_t* xx = (uint16_t*)xy;
    104     for (int i = count; i > 0; --i) {
    105         *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
    106         fx += dx;
    107     }
    108 }
    109 
    110 static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
    111                                  uint32_t xy[], int count, int x, int y) {
    112     SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
    113     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
    114                              SkMatrix::kScale_Mask |
    115                              SkMatrix::kAffine_Mask)) == 0);
    116 
    117     PREAMBLE(s);
    118     const SkBitmapProcStateAutoMapper mapper(s, x, y);
    119 
    120     SkFractionalInt fx = mapper.fractionalIntX();
    121     SkFractionalInt fy = mapper.fractionalIntY();
    122     SkFractionalInt dx = s.fInvSxFractionalInt;
    123     SkFractionalInt dy = s.fInvKyFractionalInt;
    124     int maxX = s.fPixmap.width() - 1;
    125     int maxY = s.fPixmap.height() - 1;
    126 
    127     if (count >= 8) {
    128         SkFractionalInt dx4 = dx * 4;
    129         SkFractionalInt dy4 = dy * 4;
    130         SkFractionalInt dx8 = dx * 8;
    131         SkFractionalInt dy8 = dy * 8;
    132 
    133         int32x4_t xbase, ybase;
    134         int32x4_t x2base, y2base;
    135         int16_t *dst16 = (int16_t *) xy;
    136 
    137         // now build fx, fx+dx, fx+2dx, fx+3dx
    138         xbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
    139         xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1);
    140         xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2);
    141         xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3);
    142 
    143         // same for fy
    144         ybase = vdupq_n_s32(SkFractionalIntToFixed(fy));
    145         ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1);
    146         ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2);
    147         ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3);
    148 
    149         x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
    150         y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4)));
    151 
    152         // store & bump
    153         do {
    154             int16x8x2_t hi16;
    155 
    156             hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX);
    157             hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY);
    158 
    159             vst2q_s16(dst16, hi16);
    160 
    161             // moving base and on to the next
    162             xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
    163             ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
    164             x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
    165             y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
    166 
    167             dst16 += 16; // 8x32 aka 16x16
    168             count -= 8;
    169             fx += dx8;
    170             fy += dy8;
    171         } while (count >= 8);
    172         xy = (uint32_t *) dst16;
    173     }
    174 
    175     for (int i = count; i > 0; --i) {
    176         *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
    177                  TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
    178         fx += dx; fy += dy;
    179     }
    180 }
    181 
    182 static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
    183                                 uint32_t* SK_RESTRICT xy,
    184                                 int count, int x, int y) {
    185     SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
    186 
    187     PREAMBLE(s);
    188     // max{X,Y} are int here, but later shown/assumed to fit in 16 bits
    189     int maxX = s.fPixmap.width() - 1;
    190     int maxY = s.fPixmap.height() - 1;
    191 
    192     SkPerspIter iter(s.fInvMatrix,
    193                      SkIntToScalar(x) + SK_ScalarHalf,
    194                      SkIntToScalar(y) + SK_ScalarHalf, count);
    195 
    196     while ((count = iter.next()) != 0) {
    197         const SkFixed* SK_RESTRICT srcXY = iter.getXY();
    198 
    199         if (count >= 8) {
    200             int32_t *mysrc = (int32_t *) srcXY;
    201             int16_t *mydst = (int16_t *) xy;
    202             do {
    203                 int16x8x2_t hi16;
    204                 int32x4x2_t xy1, xy2;
    205 
    206                 xy1 = vld2q_s32(mysrc);
    207                 xy2 = vld2q_s32(mysrc+8);
    208 
    209                 hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX);
    210                 hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY);
    211 
    212                 vst2q_s16(mydst, hi16);
    213 
    214                 count -= 8;  // 8 iterations
    215                 mysrc += 16; // 16 longs
    216                 mydst += 16; // 16 shorts, aka 8 longs
    217             } while (count >= 8);
    218             // get xy and srcXY fixed up
    219             srcXY = (const SkFixed *) mysrc;
    220             xy = (uint32_t *) mydst;
    221         }
    222 
    223         while (--count >= 0) {
    224             *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
    225                      TILEX_PROCF(srcXY[0], maxX);
    226             srcXY += 2;
    227         }
    228     }
    229 }
    230 
    231 static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
    232                                           SkFixed one PREAMBLE_PARAM_Y) {
    233     unsigned i = TILEY_PROCF(f, max);
    234     i = (i << 4) | TILEY_LOW_BITS(f, max);
    235     return (i << 14) | (TILEY_PROCF((f + one), max));
    236 }
    237 
    238 static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
    239                                           SkFixed one PREAMBLE_PARAM_X) {
    240     unsigned i = TILEX_PROCF(f, max);
    241     i = (i << 4) | TILEX_LOW_BITS(f, max);
    242     return (i << 14) | (TILEX_PROCF((f + one), max));
    243 }
    244 
    245 static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
    246                                           SkFixed one PREAMBLE_PARAM_X) {
    247     int32x4_t ret, res, wide_one;
    248 
    249     // Prepare constants
    250     wide_one = vdupq_n_s32(one);
    251 
    252     // Step 1
    253     res = TILEX_PROCF_NEON4(f, max);
    254 
    255     // Step 2
    256     ret = TILEX_LOW_BITS_NEON4(f, max);
    257     ret = vsliq_n_s32(ret, res, 4);
    258 
    259     // Step 3
    260     res = TILEX_PROCF_NEON4(f + wide_one, max);
    261     ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
    262 
    263     return ret;
    264 }
    265 
    266 static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
    267                                           SkFixed one PREAMBLE_PARAM_X) {
    268     int32x4_t ret, res, wide_one;
    269 
    270     // Prepare constants
    271     wide_one = vdupq_n_s32(one);
    272 
    273     // Step 1
    274     res = TILEY_PROCF_NEON4(f, max);
    275 
    276     // Step 2
    277     ret = TILEY_LOW_BITS_NEON4(f, max);
    278     ret = vsliq_n_s32(ret, res, 4);
    279 
    280     // Step 3
    281     res = TILEY_PROCF_NEON4(f + wide_one, max);
    282     ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
    283 
    284     return ret;
    285 }
    286 
    287 static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
    288                               uint32_t xy[], int count, int x, int y) {
    289     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
    290                              SkMatrix::kScale_Mask)) == 0);
    291     SkASSERT(s.fInvKy == 0);
    292 
    293     PREAMBLE(s);
    294 
    295     const unsigned maxX = s.fPixmap.width() - 1;
    296     const SkFixed one = s.fFilterOneX;
    297     const SkFractionalInt dx = s.fInvSxFractionalInt;
    298     SkFractionalInt fx;
    299 
    300     {
    301         const SkBitmapProcStateAutoMapper mapper(s, x, y);
    302         const SkFixed fy = mapper.fixedY();
    303         const unsigned maxY = s.fPixmap.height() - 1;
    304         // compute our two Y values up front
    305         *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
    306         // now initialize fx
    307         fx = mapper.fractionalIntX();
    308     }
    309 
    310 #ifdef CHECK_FOR_DECAL
    311     // test if we don't need to apply the tile proc
    312     if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
    313         decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
    314                              SkFractionalIntToFixed(dx), count);
    315         return;
    316     }
    317 #endif
    318     {
    319 
    320     if (count >= 4) {
    321         int32x4_t wide_fx;
    322 
    323         wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
    324         wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
    325         wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
    326         wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);
    327 
    328         while (count >= 4) {
    329             int32x4_t res;
    330 
    331             res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
    332 
    333             vst1q_u32(xy, vreinterpretq_u32_s32(res));
    334 
    335             wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx));
    336             fx += dx+dx+dx+dx;
    337             xy += 4;
    338             count -= 4;
    339         }
    340     }
    341 
    342     while (--count >= 0) {
    343         *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
    344         fx += dx;
    345     }
    346 
    347     }
    348 }
    349 
    350 static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
    351                                uint32_t xy[], int count, int x, int y) {
    352     SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
    353     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
    354                              SkMatrix::kScale_Mask |
    355                              SkMatrix::kAffine_Mask)) == 0);
    356 
    357     PREAMBLE(s);
    358     const SkBitmapProcStateAutoMapper mapper(s, x, y);
    359 
    360     SkFixed oneX = s.fFilterOneX;
    361     SkFixed oneY = s.fFilterOneY;
    362     SkFixed fx = mapper.fixedX();
    363     SkFixed fy = mapper.fixedY();
    364     SkFixed dx = s.fInvSx;
    365     SkFixed dy = s.fInvKy;
    366     unsigned maxX = s.fPixmap.width() - 1;
    367     unsigned maxY = s.fPixmap.height() - 1;
    368 
    369     if (count >= 4) {
    370         int32x4_t wide_fy, wide_fx;
    371 
    372         wide_fx = vdupq_n_s32(fx);
    373         wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
    374         wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
    375         wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
    376 
    377         wide_fy = vdupq_n_s32(fy);
    378         wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
    379         wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
    380         wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
    381 
    382         while (count >= 4) {
    383             int32x4x2_t vxy;
    384 
    385             // do the X side, then the Y side, then interleave them
    386             vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
    387             vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);
    388 
    389             // interleave as YXYXYXYX as part of the storing
    390             vst2q_s32((int32_t*)xy, vxy);
    391 
    392             // prepare next iteration
    393             wide_fx += vdupq_n_s32(dx+dx+dx+dx);
    394             fx += dx + dx + dx + dx;
    395             wide_fy += vdupq_n_s32(dy+dy+dy+dy);
    396             fy += dy+dy+dy+dy;
    397             xy += 8; // 4 x's, 4 y's
    398             count -= 4;
    399         }
    400     }
    401 
    402     while (--count >= 0) {
    403         // NB: writing Y/X
    404         *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
    405         fy += dy;
    406         *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
    407         fx += dx;
    408     }
    409 }
    410 
    411 static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
    412                               uint32_t* SK_RESTRICT xy, int count,
    413                               int x, int y) {
    414     SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
    415 
    416     PREAMBLE(s);
    417     unsigned maxX = s.fPixmap.width() - 1;
    418     unsigned maxY = s.fPixmap.height() - 1;
    419     SkFixed oneX = s.fFilterOneX;
    420     SkFixed oneY = s.fFilterOneY;
    421 
    422     SkPerspIter iter(s.fInvMatrix,
    423                      SkIntToScalar(x) + SK_ScalarHalf,
    424                      SkIntToScalar(y) + SK_ScalarHalf, count);
    425 
    426     while ((count = iter.next()) != 0) {
    427         const SkFixed* SK_RESTRICT srcXY = iter.getXY();
    428 
    429         while (count >= 4) {
    430             int32x4_t wide_x, wide_y;
    431             int32x4x2_t vxy, vresyx;
    432 
    433             // load src:  x-y-x-y-x-y-x-y
    434             vxy = vld2q_s32(srcXY);
    435 
    436             // do the X side, then the Y side, then interleave them
    437             wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1));
    438             wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1));
    439 
    440             vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y);
    441             vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X);
    442 
    443             // store interleaved as y-x-y-x-y-x-y-x (NB != read order)
    444             vst2q_s32((int32_t*)xy, vresyx);
    445 
    446             // on to the next iteration
    447             srcXY += 2*4;
    448             count -= 4;
    449             xy += 2*4;
    450         }
    451 
    452         while (--count >= 0) {
    453             // NB: we read x/y, we write y/x
    454             *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
    455                                        oneY PREAMBLE_ARG_Y);
    456             *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
    457                                        oneX PREAMBLE_ARG_X);
    458             srcXY += 2;
    459         }
    460     }
    461 }
    462 
    463 const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
    464     SCALE_NOFILTER_NAME,
    465     SCALE_FILTER_NAME,
    466     AFFINE_NOFILTER_NAME,
    467     AFFINE_FILTER_NAME,
    468     PERSP_NOFILTER_NAME,
    469     PERSP_FILTER_NAME
    470 };
    471 
    472 #undef TILEX_PROCF_NEON8
    473 #undef TILEY_PROCF_NEON8
    474 #undef TILEX_PROCF_NEON4
    475 #undef TILEY_PROCF_NEON4
    476 #undef TILEX_LOW_BITS_NEON4
    477 #undef TILEY_LOW_BITS_NEON4
    478 
    479 #undef MAKENAME
    480 #undef TILEX_PROCF
    481 #undef TILEY_PROCF
    482 #ifdef CHECK_FOR_DECAL
    483     #undef CHECK_FOR_DECAL
    484 #endif
    485 
    486 #undef SCALE_NOFILTER_NAME
    487 #undef SCALE_FILTER_NAME
    488 #undef AFFINE_NOFILTER_NAME
    489 #undef AFFINE_FILTER_NAME
    490 #undef PERSP_NOFILTER_NAME
    491 #undef PERSP_FILTER_NAME
    492 
    493 #undef PREAMBLE
    494 #undef PREAMBLE_PARAM_X
    495 #undef PREAMBLE_PARAM_Y
    496 #undef PREAMBLE_ARG_X
    497 #undef PREAMBLE_ARG_Y
    498 
    499 #undef TILEX_LOW_BITS
    500 #undef TILEY_LOW_BITS
    501