Home | History | Annotate | Download | only in opts
      1 
      2 #include <arm_neon.h>
      3 
      4 
// Names of the six matrix procs defined below. MAKENAME is not defined in
// this file; the including file must provide it before this point.
#define SCALE_NOFILTER_NAME     MAKENAME(_nofilter_scale)
#define SCALE_FILTER_NAME       MAKENAME(_filter_scale)
#define AFFINE_NOFILTER_NAME    MAKENAME(_nofilter_affine)
#define AFFINE_FILTER_NAME      MAKENAME(_filter_affine)
#define PERSP_NOFILTER_NAME     MAKENAME(_nofilter_persp)
#define PERSP_FILTER_NAME       MAKENAME(_filter_persp)

// Names of the helpers that pack filter coordinates: scalar (one value) and
// NEON (four values at a time, the *4 variants).
#define PACK_FILTER_X_NAME  MAKENAME(_pack_filter_x)
#define PACK_FILTER_Y_NAME  MAKENAME(_pack_filter_y)
#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4)
#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4)
     16 
// Default (empty) PREAMBLE hooks, used when the including file did not define
// PREAMBLE itself. PREAMBLE(state) is expanded near the top of every proc
// below; PREAMBLE_PARAM_X/Y are appended to the parameter lists of the
// PACK_FILTER helpers and PREAMBLE_ARG_X/Y to the matching call sites.
#ifndef PREAMBLE
    #define PREAMBLE(state)
    #define PREAMBLE_PARAM_X
    #define PREAMBLE_PARAM_Y
    #define PREAMBLE_ARG_X
    #define PREAMBLE_ARG_Y
#endif
     24 
     25 static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
     26                                 uint32_t xy[], int count, int x, int y) {
     27     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
     28                              SkMatrix::kScale_Mask)) == 0);
     29 
     30     PREAMBLE(s);
     31 
     32     // we store y, x, x, x, x, x
     33     const unsigned maxX = s.fBitmap->width() - 1;
     34     SkFractionalInt fx;
     35     {
     36         SkPoint pt;
     37         s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
     38                                  SkIntToScalar(y) + SK_ScalarHalf, &pt);
     39         fx = SkScalarToFractionalInt(pt.fY);
     40         const unsigned maxY = s.fBitmap->height() - 1;
     41         *xy++ = TILEY_PROCF(SkFractionalIntToFixed(fx), maxY);
     42         fx = SkScalarToFractionalInt(pt.fX);
     43     }
     44 
     45     if (0 == maxX) {
     46         // all of the following X values must be 0
     47         memset(xy, 0, count * sizeof(uint16_t));
     48         return;
     49     }
     50 
     51     const SkFractionalInt dx = s.fInvSxFractionalInt;
     52 
     53 #ifdef CHECK_FOR_DECAL
     54     // test if we don't need to apply the tile proc
     55     if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
     56         decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx),
     57                              SkFractionalIntToFixed(dx), count);
     58         return;
     59     }
     60 #endif
     61 
     62     if (count >= 8) {
     63         SkFractionalInt dx2 = dx+dx;
     64         SkFractionalInt dx4 = dx2+dx2;
     65         SkFractionalInt dx8 = dx4+dx4;
     66 
     67         // now build fx/fx+dx/fx+2dx/fx+3dx
     68         SkFractionalInt fx1, fx2, fx3;
     69         int32x4_t lbase, hbase;
     70         int16_t *dst16 = (int16_t *)xy;
     71 
     72         fx1 = fx+dx;
     73         fx2 = fx1+dx;
     74         fx3 = fx2+dx;
     75 
     76         lbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
     77         lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1);
     78         lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2);
     79         lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3);
     80         hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
     81 
     82         // store & bump
     83         while (count >= 8) {
     84 
     85             int16x8_t fx8;
     86 
     87             fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX);
     88 
     89             vst1q_s16(dst16, fx8);
     90 
     91             // but preserving base & on to the next
     92             lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
     93             hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
     94             dst16 += 8;
     95             count -= 8;
     96             fx += dx8;
     97         };
     98         xy = (uint32_t *) dst16;
     99     }
    100 
    101     uint16_t* xx = (uint16_t*)xy;
    102     for (int i = count; i > 0; --i) {
    103         *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
    104         fx += dx;
    105     }
    106 }
    107 
    108 static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
    109                                  uint32_t xy[], int count, int x, int y) {
    110     SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
    111     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
    112                              SkMatrix::kScale_Mask |
    113                              SkMatrix::kAffine_Mask)) == 0);
    114 
    115     PREAMBLE(s);
    116     SkPoint srcPt;
    117     s.fInvProc(s.fInvMatrix,
    118                SkIntToScalar(x) + SK_ScalarHalf,
    119                SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
    120 
    121     SkFractionalInt fx = SkScalarToFractionalInt(srcPt.fX);
    122     SkFractionalInt fy = SkScalarToFractionalInt(srcPt.fY);
    123     SkFractionalInt dx = s.fInvSxFractionalInt;
    124     SkFractionalInt dy = s.fInvKyFractionalInt;
    125     int maxX = s.fBitmap->width() - 1;
    126     int maxY = s.fBitmap->height() - 1;
    127 
    128     if (count >= 8) {
    129         SkFractionalInt dx4 = dx * 4;
    130         SkFractionalInt dy4 = dy * 4;
    131         SkFractionalInt dx8 = dx * 8;
    132         SkFractionalInt dy8 = dy * 8;
    133 
    134         int32x4_t xbase, ybase;
    135         int32x4_t x2base, y2base;
    136         int16_t *dst16 = (int16_t *) xy;
    137 
    138         // now build fx, fx+dx, fx+2dx, fx+3dx
    139         xbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
    140         xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1);
    141         xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2);
    142         xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3);
    143 
    144         // same for fy
    145         ybase = vdupq_n_s32(SkFractionalIntToFixed(fy));
    146         ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1);
    147         ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2);
    148         ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3);
    149 
    150         x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
    151         y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4)));
    152 
    153         // store & bump
    154         do {
    155             int16x8x2_t hi16;
    156 
    157             hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX);
    158             hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY);
    159 
    160             vst2q_s16(dst16, hi16);
    161 
    162             // moving base and on to the next
    163             xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
    164             ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
    165             x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
    166             y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
    167 
    168             dst16 += 16; // 8x32 aka 16x16
    169             count -= 8;
    170             fx += dx8;
    171             fy += dy8;
    172         } while (count >= 8);
    173         xy = (uint32_t *) dst16;
    174     }
    175 
    176     for (int i = count; i > 0; --i) {
    177         *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
    178                  TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
    179         fx += dx; fy += dy;
    180     }
    181 }
    182 
    183 static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
    184                                 uint32_t* SK_RESTRICT xy,
    185                                 int count, int x, int y) {
    186     SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
    187 
    188     PREAMBLE(s);
    189     // max{X,Y} are int here, but later shown/assumed to fit in 16 bits
    190     int maxX = s.fBitmap->width() - 1;
    191     int maxY = s.fBitmap->height() - 1;
    192 
    193     SkPerspIter iter(s.fInvMatrix,
    194                      SkIntToScalar(x) + SK_ScalarHalf,
    195                      SkIntToScalar(y) + SK_ScalarHalf, count);
    196 
    197     while ((count = iter.next()) != 0) {
    198         const SkFixed* SK_RESTRICT srcXY = iter.getXY();
    199 
    200         if (count >= 8) {
    201             int32_t *mysrc = (int32_t *) srcXY;
    202             int16_t *mydst = (int16_t *) xy;
    203             do {
    204                 int16x8x2_t hi16;
    205                 int32x4x2_t xy1, xy2;
    206 
    207                 xy1 = vld2q_s32(mysrc);
    208                 xy2 = vld2q_s32(mysrc+8);
    209 
    210                 hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX);
    211                 hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY);
    212 
    213                 vst2q_s16(mydst, hi16);
    214 
    215                 count -= 8;  // 8 iterations
    216                 mysrc += 16; // 16 longs
    217                 mydst += 16; // 16 shorts, aka 8 longs
    218             } while (count >= 8);
    219             // get xy and srcXY fixed up
    220             srcXY = (const SkFixed *) mysrc;
    221             xy = (uint32_t *) mydst;
    222         }
    223 
    224         while (--count >= 0) {
    225             *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
    226                      TILEX_PROCF(srcXY[0], maxX);
    227             srcXY += 2;
    228         }
    229     }
    230 }
    231 
    232 static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
    233                                           SkFixed one PREAMBLE_PARAM_Y) {
    234     unsigned i = TILEY_PROCF(f, max);
    235     i = (i << 4) | TILEY_LOW_BITS(f, max);
    236     return (i << 14) | (TILEY_PROCF((f + one), max));
    237 }
    238 
    239 static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
    240                                           SkFixed one PREAMBLE_PARAM_X) {
    241     unsigned i = TILEX_PROCF(f, max);
    242     i = (i << 4) | TILEX_LOW_BITS(f, max);
    243     return (i << 14) | (TILEX_PROCF((f + one), max));
    244 }
    245 
    246 static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
    247                                           SkFixed one PREAMBLE_PARAM_X) {
    248     int32x4_t ret, res, wide_one;
    249 
    250     // Prepare constants
    251     wide_one = vdupq_n_s32(one);
    252 
    253     // Step 1
    254     res = TILEX_PROCF_NEON4(f, max);
    255 
    256     // Step 2
    257     ret = TILEX_LOW_BITS_NEON4(f, max);
    258     ret = vsliq_n_s32(ret, res, 4);
    259 
    260     // Step 3
    261     res = TILEX_PROCF_NEON4(f + wide_one, max);
    262     ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
    263 
    264     return ret;
    265 }
    266 
    267 static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
    268                                           SkFixed one PREAMBLE_PARAM_X) {
    269     int32x4_t ret, res, wide_one;
    270 
    271     // Prepare constants
    272     wide_one = vdupq_n_s32(one);
    273 
    274     // Step 1
    275     res = TILEY_PROCF_NEON4(f, max);
    276 
    277     // Step 2
    278     ret = TILEY_LOW_BITS_NEON4(f, max);
    279     ret = vsliq_n_s32(ret, res, 4);
    280 
    281     // Step 3
    282     res = TILEY_PROCF_NEON4(f + wide_one, max);
    283     ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
    284 
    285     return ret;
    286 }
    287 
    288 static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
    289                               uint32_t xy[], int count, int x, int y) {
    290     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
    291                              SkMatrix::kScale_Mask)) == 0);
    292     SkASSERT(s.fInvKy == 0);
    293 
    294     PREAMBLE(s);
    295 
    296     const unsigned maxX = s.fBitmap->width() - 1;
    297     const SkFixed one = s.fFilterOneX;
    298     const SkFractionalInt dx = s.fInvSxFractionalInt;
    299     SkFractionalInt fx;
    300 
    301     {
    302         SkPoint pt;
    303         s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
    304                                  SkIntToScalar(y) + SK_ScalarHalf, &pt);
    305         const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
    306         const unsigned maxY = s.fBitmap->height() - 1;
    307         // compute our two Y values up front
    308         *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
    309         // now initialize fx
    310         fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1);
    311     }
    312 
    313 #ifdef CHECK_FOR_DECAL
    314     // test if we don't need to apply the tile proc
    315     if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
    316         decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
    317                              SkFractionalIntToFixed(dx), count);
    318         return;
    319     }
    320 #endif
    321     {
    322 
    323     if (count >= 4) {
    324         int32x4_t wide_fx;
    325 
    326         wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
    327         wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
    328         wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
    329         wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);
    330 
    331         while (count >= 4) {
    332             int32x4_t res;
    333 
    334             res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
    335 
    336             vst1q_u32(xy, vreinterpretq_u32_s32(res));
    337 
    338             wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx));
    339             fx += dx+dx+dx+dx;
    340             xy += 4;
    341             count -= 4;
    342         }
    343     }
    344 
    345     while (--count >= 0) {
    346         *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
    347         fx += dx;
    348     }
    349 
    350     }
    351 }
    352 
    353 static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
    354                                uint32_t xy[], int count, int x, int y) {
    355     SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
    356     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
    357                              SkMatrix::kScale_Mask |
    358                              SkMatrix::kAffine_Mask)) == 0);
    359 
    360     PREAMBLE(s);
    361     SkPoint srcPt;
    362     s.fInvProc(s.fInvMatrix,
    363                SkIntToScalar(x) + SK_ScalarHalf,
    364                SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
    365 
    366     SkFixed oneX = s.fFilterOneX;
    367     SkFixed oneY = s.fFilterOneY;
    368     SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
    369     SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
    370     SkFixed dx = s.fInvSx;
    371     SkFixed dy = s.fInvKy;
    372     unsigned maxX = s.fBitmap->width() - 1;
    373     unsigned maxY = s.fBitmap->height() - 1;
    374 
    375     if (count >= 4) {
    376         int32x4_t wide_fy, wide_fx;
    377 
    378         wide_fx = vdupq_n_s32(fx);
    379         wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
    380         wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
    381         wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
    382 
    383         wide_fy = vdupq_n_s32(fy);
    384         wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
    385         wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
    386         wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
    387 
    388         while (count >= 4) {
    389             int32x4x2_t vxy;
    390 
    391             // do the X side, then the Y side, then interleave them
    392             vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
    393             vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);
    394 
    395             // interleave as YXYXYXYX as part of the storing
    396             vst2q_s32((int32_t*)xy, vxy);
    397 
    398             // prepare next iteration
    399             wide_fx += vdupq_n_s32(dx+dx+dx+dx);
    400             fx += dx + dx + dx + dx;
    401             wide_fy += vdupq_n_s32(dy+dy+dy+dy);
    402             fy += dy+dy+dy+dy;
    403             xy += 8; // 4 x's, 4 y's
    404             count -= 4;
    405         }
    406     }
    407 
    408     while (--count >= 0) {
    409         // NB: writing Y/X
    410         *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
    411         fy += dy;
    412         *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
    413         fx += dx;
    414     }
    415 }
    416 
    417 static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
    418                               uint32_t* SK_RESTRICT xy, int count,
    419                               int x, int y) {
    420     SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
    421 
    422     PREAMBLE(s);
    423     unsigned maxX = s.fBitmap->width() - 1;
    424     unsigned maxY = s.fBitmap->height() - 1;
    425     SkFixed oneX = s.fFilterOneX;
    426     SkFixed oneY = s.fFilterOneY;
    427 
    428     SkPerspIter iter(s.fInvMatrix,
    429                      SkIntToScalar(x) + SK_ScalarHalf,
    430                      SkIntToScalar(y) + SK_ScalarHalf, count);
    431 
    432     while ((count = iter.next()) != 0) {
    433         const SkFixed* SK_RESTRICT srcXY = iter.getXY();
    434 
    435         while (count >= 4) {
    436             int32x4_t wide_x, wide_y;
    437             int32x4x2_t vxy, vresyx;
    438 
    439             // load src:  x-y-x-y-x-y-x-y
    440             vxy = vld2q_s32(srcXY);
    441 
    442             // do the X side, then the Y side, then interleave them
    443             wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1));
    444             wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1));
    445 
    446             vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y);
    447             vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X);
    448 
    449             // store interleaved as y-x-y-x-y-x-y-x (NB != read order)
    450             vst2q_s32((int32_t*)xy, vresyx);
    451 
    452             // on to the next iteration
    453             srcXY += 2*4;
    454             count -= 4;
    455             xy += 2*4;
    456         }
    457 
    458         while (--count >= 0) {
    459             // NB: we read x/y, we write y/x
    460             *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
    461                                        oneY PREAMBLE_ARG_Y);
    462             *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
    463                                        oneX PREAMBLE_ARG_X);
    464             srcXY += 2;
    465         }
    466     }
    467 }
    468 
// Table of the six matrix procs defined above, in the order
// scale, affine, perspective, each as a (nofilter, filter) pair.
// NOTE(review): consumers presumably index this table by matrix class and
// filter flag -- confirm against the code that reads it.
const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
    SCALE_NOFILTER_NAME,
    SCALE_FILTER_NAME,
    AFFINE_NOFILTER_NAME,
    AFFINE_FILTER_NAME,
    PERSP_NOFILTER_NAME,
    PERSP_FILTER_NAME
};
    477 
// Cleanup: undefine every configuration macro this file consumed as well as
// the helper names it defined, presumably so the including file can set up a
// different configuration and include this file again.

// NEON tiling macros (supplied by the including file)
#undef TILEX_PROCF_NEON8
#undef TILEY_PROCF_NEON8
#undef TILEX_PROCF_NEON4
#undef TILEY_PROCF_NEON4
#undef TILEX_LOW_BITS_NEON4
#undef TILEY_LOW_BITS_NEON4

// scalar naming/tiling macros (supplied by the including file)
#undef MAKENAME
#undef TILEX_PROCF
#undef TILEY_PROCF
#ifdef CHECK_FOR_DECAL
    #undef CHECK_FOR_DECAL
#endif

// proc names defined at the top of this file
#undef SCALE_NOFILTER_NAME
#undef SCALE_FILTER_NAME
#undef AFFINE_NOFILTER_NAME
#undef AFFINE_FILTER_NAME
#undef PERSP_NOFILTER_NAME
#undef PERSP_FILTER_NAME

// PREAMBLE hooks (either the defaults above or the includer's versions)
#undef PREAMBLE
#undef PREAMBLE_PARAM_X
#undef PREAMBLE_PARAM_Y
#undef PREAMBLE_ARG_X
#undef PREAMBLE_ARG_Y

#undef TILEX_LOW_BITS
#undef TILEY_LOW_BITS
    507