Home | History | Annotate | Download | only in core
      1 /*
      2  * Copyright 2017 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include "SkArenaAlloc.h"
      9 #include "SkGaussFilter.h"
     10 #include "SkMalloc.h"
     11 #include "SkMaskBlurFilter.h"
     12 #include "SkNx.h"
     13 #include "SkSafeMath.h"
     14 
     15 #include <cmath>
     16 #include <climits>
     17 
     18 static const double kPi = 3.14159265358979323846264338327950288;
     19 
     20 class BlurScanInterface {
     21 public:
     22     virtual ~BlurScanInterface() = default;
     23     virtual void blur(const uint8_t* src, int srcStride, const uint8_t* srcEnd,
     24                             uint8_t* dst, int dstStride,       uint8_t* dstEnd) const = 0;
     25     virtual bool canBlur4() { return false; }
     26     virtual void blur4Transpose(
     27         const uint8_t* src, int srcStride, const uint8_t* srcEnd,
     28               uint8_t* dst, int dstStride,       uint8_t* dstEnd) const {
     29         SK_ABORT("This should not be called.");
     30     }
     31 };
     32 
     33 class PlanningInterface {
     34 public:
     35     virtual ~PlanningInterface() = default;
     36     virtual size_t bufferSize() const = 0;
     37     virtual int    border() const = 0;
     38     virtual bool   needsBlur() const = 0;
     39     virtual BlurScanInterface* makeBlurScan(
     40         SkArenaAlloc* alloc, int width, uint32_t* buffer) const = 0;
     41 };
     42 
     43 class None final : public PlanningInterface {
     44 public:
     45     None() = default;
     46     size_t bufferSize() const override { return 0; }
     47     int    border()     const override { return 0; }
     48     bool   needsBlur()  const override { return false; }
     49     BlurScanInterface* makeBlurScan(
     50         SkArenaAlloc* alloc, int width, uint32_t* buffer) const override {
     51         SK_ABORT("Should never be called.");
     52         return nullptr;
     53     }
     54 };
     55 
     56 class PlanGauss final : public PlanningInterface {
     57 public:
     58     explicit PlanGauss(double sigma) {
     59         auto possibleWindow = static_cast<int>(floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5));
     60         auto window = std::max(1, possibleWindow);
     61 
     62         fPass0Size = window - 1;
     63         fPass1Size = window - 1;
     64         fPass2Size = (window & 1) == 1 ? window - 1 : window;
     65 
     66         // Calculating the border is tricky. I will go through the odd case which is simpler, and
     67         // then through the even case. Given a stack of filters seven wide for the odd case of
     68         // three passes.
     69         //
     70         //        S
     71         //     aaaAaaa
     72         //     bbbBbbb
     73         //     cccCccc
     74         //        D
     75         //
     76         // The furthest changed pixel is when the filters are in the following configuration.
     77         //
     78         //                 S
     79         //           aaaAaaa
     80         //        bbbBbbb
     81         //     cccCccc
     82         //        D
     83         //
     84         //  The A pixel is calculated using the value S, the B uses A, and the C uses B, and
     85         // finally D is C. So, with a window size of seven the border is nine. In general, the
     86         // border is 3*((window - 1)/2).
     87         //
     88         // For even cases the filter stack is more complicated. The spec specifies two passes
     89         // of even filters and a final pass of odd filters. A stack for a width of six looks like
     90         // this.
     91         //
     92         //       S
     93         //    aaaAaa
     94         //     bbBbbb
     95         //    cccCccc
     96         //       D
     97         //
     98         // The furthest pixel looks like this.
     99         //
    100         //               S
    101         //          aaaAaa
    102         //        bbBbbb
    103         //    cccCccc
    104         //       D
    105         //
    106         // For a window of size, the border value is seven. In general the border is 3 *
    107         // (window/2) -1.
    108         fBorder = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1;
    109         fSlidingWindow = 2 * fBorder + 1;
    110 
    111         // If the window is odd then the divisor is just window ^ 3 otherwise,
    112         // it is window * window * (window + 1) = window ^ 2 + window ^ 3;
    113         auto window2 = window * window;
    114         auto window3 = window2 * window;
    115         auto divisor = (window & 1) == 1 ? window3 : window3 + window2;
    116 
    117         fWeight = static_cast<uint64_t>(round(1.0 / divisor * (1ull << 32)));
    118     }
    119 
    120     size_t bufferSize() const override { return fPass0Size + fPass1Size + fPass2Size; }
    121 
    122     int    border()     const override { return fBorder; }
    123 
    124     bool needsBlur()    const override { return true; }
    125 
    126     BlurScanInterface* makeBlurScan(
    127         SkArenaAlloc* alloc, int width, uint32_t* buffer) const override
    128     {
    129         uint32_t* buffer0, *buffer0End, *buffer1, *buffer1End, *buffer2, *buffer2End;
    130         buffer0 = buffer;
    131         buffer0End = buffer1 = buffer0 + fPass0Size;
    132         buffer1End = buffer2 = buffer1 + fPass1Size;
    133         buffer2End = buffer2 + fPass2Size;
    134         int noChangeCount = fSlidingWindow > width ? fSlidingWindow - width : 0;
    135 
    136         return alloc->make<Gauss>(
    137             fWeight, noChangeCount,
    138             buffer0, buffer0End,
    139             buffer1, buffer1End,
    140             buffer2, buffer2End);
    141     }
    142 
    143 public:
    144     class Gauss final : public BlurScanInterface {
    145     public:
    146         Gauss(uint64_t weight, int noChangeCount,
    147               uint32_t* buffer0, uint32_t* buffer0End,
    148               uint32_t* buffer1, uint32_t* buffer1End,
    149               uint32_t* buffer2, uint32_t* buffer2End)
    150             : fWeight{weight}
    151             , fNoChangeCount{noChangeCount}
    152             , fBuffer0{buffer0}
    153             , fBuffer0End{buffer0End}
    154             , fBuffer1{buffer1}
    155             , fBuffer1End{buffer1End}
    156             , fBuffer2{buffer2}
    157             , fBuffer2End{buffer2End}
    158         { }
    159 
    160         void blur(const uint8_t* src, int srcStride, const uint8_t* srcEnd,
    161                         uint8_t* dst, int dstStride, uint8_t* dstEnd) const override {
    162             auto buffer0Cursor = fBuffer0;
    163             auto buffer1Cursor = fBuffer1;
    164             auto buffer2Cursor = fBuffer2;
    165 
    166             std::memset(fBuffer0, 0x00, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));
    167 
    168             uint32_t sum0 = 0;
    169             uint32_t sum1 = 0;
    170             uint32_t sum2 = 0;
    171 
    172             // Consume the source generating pixels.
    173             for (auto srcCursor = src;
    174                  srcCursor < srcEnd; dst += dstStride, srcCursor += srcStride) {
    175                 uint32_t leadingEdge = *srcCursor;
    176                 sum0 += leadingEdge;
    177                 sum1 += sum0;
    178                 sum2 += sum1;
    179 
    180                 *dst = this->finalScale(sum2);
    181 
    182                 sum2 -= *buffer2Cursor;
    183                 *buffer2Cursor = sum1;
    184                 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
    185 
    186                 sum1 -= *buffer1Cursor;
    187                 *buffer1Cursor = sum0;
    188                 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
    189 
    190                 sum0 -= *buffer0Cursor;
    191                 *buffer0Cursor = leadingEdge;
    192                 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
    193             }
    194 
    195             // The leading edge is off the right side of the mask.
    196             for (int i = 0; i < fNoChangeCount; i++) {
    197                 uint32_t leadingEdge = 0;
    198                 sum0 += leadingEdge;
    199                 sum1 += sum0;
    200                 sum2 += sum1;
    201 
    202                 *dst = this->finalScale(sum2);
    203 
    204                 sum2 -= *buffer2Cursor;
    205                 *buffer2Cursor = sum1;
    206                 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
    207 
    208                 sum1 -= *buffer1Cursor;
    209                 *buffer1Cursor = sum0;
    210                 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
    211 
    212                 sum0 -= *buffer0Cursor;
    213                 *buffer0Cursor = leadingEdge;
    214                 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
    215 
    216                 dst += dstStride;
    217             }
    218 
    219             // Starting from the right, fill in the rest of the buffer.
    220             std::memset(fBuffer0, 0, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));
    221 
    222             sum0 = sum1 = sum2 = 0;
    223 
    224             uint8_t* dstCursor = dstEnd;
    225             const uint8_t* srcCursor = srcEnd;
    226             while (dstCursor > dst) {
    227                 dstCursor -= dstStride;
    228                 srcCursor -= srcStride;
    229                 uint32_t leadingEdge = *srcCursor;
    230                 sum0 += leadingEdge;
    231                 sum1 += sum0;
    232                 sum2 += sum1;
    233 
    234                 *dstCursor = this->finalScale(sum2);
    235 
    236                 sum2 -= *buffer2Cursor;
    237                 *buffer2Cursor = sum1;
    238                 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
    239 
    240                 sum1 -= *buffer1Cursor;
    241                 *buffer1Cursor = sum0;
    242                 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
    243 
    244                 sum0 -= *buffer0Cursor;
    245                 *buffer0Cursor = leadingEdge;
    246                 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
    247             }
    248         }
    249 
    250     private:
    251         static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31;
    252 
    253         uint8_t finalScale(uint32_t sum) const {
    254             return SkTo<uint8_t>((fWeight * sum + kHalf) >> 32);
    255         }
    256 
    257         uint64_t  fWeight;
    258         int       fNoChangeCount;
    259         uint32_t* fBuffer0;
    260         uint32_t* fBuffer0End;
    261         uint32_t* fBuffer1;
    262         uint32_t* fBuffer1End;
    263         uint32_t* fBuffer2;
    264         uint32_t* fBuffer2End;
    265     };
    266 
    267     uint64_t fWeight;
    268     int      fBorder;
    269     int      fSlidingWindow;
    270     int      fPass0Size;
    271     int      fPass1Size;
    272     int      fPass2Size;
    273 };
    274 
    275 // NB 136 is the largest sigma that will not cause a buffer full of 255 mask values to overflow
    276 // using the Gauss filter. It also limits the size of buffers used hold intermediate values.
    277 // Explanation of maximums:
    278 //   sum0 = window * 255
    279 //   sum1 = window * sum0 -> window * window * 255
    280 //   sum2 = window * sum1 -> window * window * window * 255 -> window^3 * 255
    281 //
    282 //   The value window^3 * 255 must fit in a uint32_t. So,
    283 //      window^3 < 2^32. window = 255.
    284 //
    285 //   window = floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5)
    286 //   For window <= 255, the largest value for sigma is 136.
    287 SkMaskBlurFilter::SkMaskBlurFilter(double sigmaW, double sigmaH)
    288     : fSigmaW{SkTPin(sigmaW, 0.0, 136.0)}
    289     , fSigmaH{SkTPin(sigmaH, 0.0, 136.0)}
    290 {
    291     SkASSERT(sigmaW >= 0);
    292     SkASSERT(sigmaH >= 0);
    293 }
    294 
    295 bool SkMaskBlurFilter::hasNoBlur() const {
    296     return (3 * fSigmaW <= 1) && (3 * fSigmaH <= 1);
    297 }
    298 
    299 static SkMask prepare_destination(int radiusX, int radiusY, const SkMask& src) {
    300     SkSafeMath safe;
    301 
    302     SkMask dst;
    303     // dstW = srcW + 2 * radiusX;
    304     size_t dstW = safe.add(src.fBounds.width(), safe.add(radiusX, radiusX));
    305     // dstH = srcH + 2 * radiusY;
    306     size_t dstH = safe.add(src.fBounds.height(), safe.add(radiusY, radiusY));
    307 
    308     dst.fBounds.set(0, 0, SkTo<int>(dstW), SkTo<int>(dstH));
    309     dst.fBounds.offset(src.fBounds.x(), src.fBounds.y());
    310     dst.fBounds.offset(-radiusX, -radiusY);
    311 
    312     dst.fImage = nullptr;
    313     dst.fRowBytes = SkTo<uint32_t>(dstW);
    314     dst.fFormat = SkMask::kA8_Format;
    315 
    316     size_t toAlloc = safe.mul(dstW, dstH);
    317 
    318     if (safe && src.fImage != nullptr) {
    319         dst.fImage = SkMask::AllocImage(toAlloc);
    320     }
    321 
    322     return dst;
    323 }
    324 
    325 static constexpr uint16_t _____ = 0u;
    326 static constexpr uint16_t kHalf = 0x80u;
    327 
    328 static SK_ALWAYS_INLINE Sk8h load(const uint8_t* from, int width) {
    329     uint8_t buffer[8];
    330     if (width < 8) {
    331         sk_bzero(buffer, sizeof(buffer));
    332         for (int i = 0; i < width; i++) {
    333             buffer[i] = from[i];
    334         }
    335         from = buffer;
    336     }
    337     auto v = SkNx_cast<uint16_t>(Sk8b::Load(from));
    338     // Convert from 0-255 to 8.8 encoding.
    339     return v << 8;
    340 };
    341 
    342 static SK_ALWAYS_INLINE void store(uint8_t* to, const Sk8h& v, int width) {
    343     Sk8b b = SkNx_cast<uint8_t>(v >> 8);
    344     if (width == 8) {
    345         b.store(to);
    346     } else {
    347         uint8_t buffer[8];
    348         b.store(buffer);
    349         for (int i = 0; i < width; i++) {
    350             to[i] = buffer[i];
    351         }
    352     }
    353 };
    354 
    355 // In all the blur_x_radius_N and blur_y_radius_N functions the gaussian values are encoded
    356 // in 0.16 format, none of the values is greater than one. The incoming mask values are in 8.8
    357 // format. The resulting multiply has a 8.24 format, by the mulhi truncates the lower 16 bits
    358 // resulting in a 8.8 format.
    359 //
    360 // The blur_x_radius_N function below blur along a row of pixels using a kernel with radius N. This
    361 // system is setup to minimize the number of multiplies needed.
    362 //
    363 // Explanation:
    364 //    Blurring a specific mask value is given by the following equation where D_n is the resulting
    365 // mask value and S_n is the source value. The example below is for a filter with a radius of 1
    366 // and a width of 3 (radius == (width-1)/2). The indexes for the source and destination are
    367 // aligned. The filter is given by G_n where n is the symmetric filter value.
    368 //
    369 //   D[n] = S[n-1]*G[1] + S[n]*G[0] + S[n+1]*G[1].
    370 //
    371 // We can start the source index at an offset relative to the destination separated by the
    372 // radius. This results in a non-traditional restating of the above filter.
    373 //
    374 //  D[n] = S[n]*G[1] + S[n+1]*G[0] + S[n+2]*G[1]
    375 //
    376 // If we look at three specific consecutive destinations the following equations result:
    377 //
    378 //   D[5] = S[5]*G[1] + S[6]*G[0] + S[7]*G[1]
    379 //   D[7] = S[6]*G[1] + S[7]*G[0] + S[8]*G[1]
    380 //   D[8] = S[7]*G[1] + S[8]*G[0] + S[9]*G[1].
    381 //
    382 // In the above equations, notice that S[7] is used in all three. In particular, two values are
    383 // used: S[7]*G[0] and S[7]*G[1]. So, S[7] is only multiplied twice, but used in D[5], D[6] and
    384 // D[7].
    385 //
    386 // From the point of view of a source value we end up with the following three equations.
    387 //
    388 // Given S[7]:
    389 //   D[5] += S[7]*G[1]
    390 //   D[6] += S[7]*G[0]
    391 //   D[7] += S[7]*G[1]
    392 //
    393 // In General:
    394 //   D[n]   += S[n]*G[1]
    395 //   D[n+1] += S[n]*G[0]
    396 //   D[n+2] += S[n]*G[1]
    397 //
    398 // Now these equations can be ganged using SIMD to form:
    399 //   D[n..n+7]   += S[n..n+7]*G[1]
    400 //   D[n+1..n+8] += S[n..n+7]*G[0]
    401 //   D[n+2..n+9] += S[n..n+7]*G[1]
    402 // The next set of values becomes.
    403 //   D[n+8..n+15]  += S[n+8..n+15]*G[1]
    404 //   D[n+9..n+16]  += S[n+8..n+15]*G[0]
    405 //   D[n+10..n+17] += S[n+8..n+15]*G[1]
    406 // You can see that the D[n+8] and D[n+9] values overlap the two sets, using parts of both
    407 // S[n..7] and S[n+8..n+15].
    408 //
    409 // Just one more transformation allows the code to maintain all working values in
    410 // registers. I introduce the notation {0, S[n..n+7] * G[k]} to mean that the value where 0 is
    411 // prepended to the array of values to form {0, S[n] * G[k], ..., S[n+7]*G[k]}.
    412 //
    413 //   D[n..n+7]  += S[n..n+7] * G[1]
    414 //   D[n..n+8]  += {0, S[n..n+7] * G[0]}
    415 //   D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
    416 //
    417 // Now we can encode D[n..n+7] in a single Sk8h register called d0, and D[n+8..n+15] in a
    418 // register d8. In addition, S[0..n+7] becomes s0.
    419 //
    420 // The translation of the {0, S[n..n+7] * G[k]} is translated in the following way below.
    421 //
    422 // Sk8h v0 = s0*G[0]
    423 // Sk8h v1 = s0*G[1]
    424 // /* D[n..n+7]  += S[n..n+7] * G[1] */
    425 // d0 += v1;
    426 // /* D[n..n+8]  += {0, S[n..n+7] * G[0]} */
    427 // d0 += {_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}
    428 // d1 += {v0[7], _____, _____, _____, _____, _____, _____, _____}
    429 // /* D[n..n+9]  += {0, 0, S[n..n+7] * G[1]} */
    430 // d0 += {_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}
    431 // d1 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____}
    432 // Where we rely on the compiler to generate efficient code for the {____, n, ....} notation.
    433 
    434 static SK_ALWAYS_INLINE void blur_x_radius_1(
    435         const Sk8h& s0,
    436         const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
    437         Sk8h* d0, Sk8h* d8) {
    438 
    439     auto v1 = s0.mulHi(g1);
    440     auto v0 = s0.mulHi(g0);
    441 
    442     // D[n..n+7]  += S[n..n+7] * G[1]
    443     *d0 += v1;
    444 
    445     //D[n..n+8]  += {0, S[n..n+7] * G[0]}
    446     *d0 += Sk8h{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]};
    447     *d8 += Sk8h{v0[7], _____, _____, _____, _____, _____, _____, _____};
    448 
    449     // D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
    450     *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
    451     *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
    452 
    453 }
    454 
    455 static SK_ALWAYS_INLINE void blur_x_radius_2(
    456         const Sk8h& s0,
    457         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
    458         Sk8h* d0, Sk8h* d8) {
    459     auto v0 = s0.mulHi(g0);
    460     auto v1 = s0.mulHi(g1);
    461     auto v2 = s0.mulHi(g2);
    462 
    463     // D[n..n+7]  += S[n..n+7] * G[2]
    464     *d0 += v2;
    465 
    466     // D[n..n+8]  += {0, S[n..n+7] * G[1]}
    467     *d0 += Sk8h{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]};
    468     *d8 += Sk8h{v1[7], _____, _____, _____, _____, _____, _____, _____};
    469 
    470     // D[n..n+9]  += {0, 0, S[n..n+7] * G[0]}
    471     *d0 += Sk8h{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]};
    472     *d8 += Sk8h{v0[6], v0[7], _____, _____, _____, _____, _____, _____};
    473 
    474     // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[1]}
    475     *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
    476     *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
    477 
    478     // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[2]}
    479     *d0 += Sk8h{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]};
    480     *d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____};
    481 }
    482 
    483 static SK_ALWAYS_INLINE void blur_x_radius_3(
    484         const Sk8h& s0,
    485         const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&,
    486         Sk8h* d0, Sk8h* d8) {
    487     auto v0 = s0.mulHi(gauss0);
    488     auto v1 = s0.mulHi(gauss1);
    489     auto v2 = s0.mulHi(gauss2);
    490     auto v3 = s0.mulHi(gauss3);
    491 
    492     // D[n..n+7]  += S[n..n+7] * G[3]
    493     *d0 += v3;
    494 
    495     // D[n..n+8]  += {0, S[n..n+7] * G[2]}
    496     *d0 += Sk8h{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]};
    497     *d8 += Sk8h{v2[7], _____, _____, _____, _____, _____, _____, _____};
    498 
    499     // D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
    500     *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
    501     *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
    502 
    503     // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[0]}
    504     *d0 += Sk8h{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]};
    505     *d8 += Sk8h{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____};
    506 
    507     // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[1]}
    508     *d0 += Sk8h{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]};
    509     *d8 += Sk8h{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____};
    510 
    511     // D[n..n+12]  += {0, 0, 0, 0, 0, S[n..n+7] * G[2]}
    512     *d0 += Sk8h{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]};
    513     *d8 += Sk8h{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____};
    514 
    515     // D[n..n+13]  += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
    516     *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v3[0], v3[1]};
    517     *d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____};
    518 }
    519 
    520 static SK_ALWAYS_INLINE void blur_x_radius_4(
    521         const Sk8h& s0,
    522         const Sk8h& gauss0,
    523         const Sk8h& gauss1,
    524         const Sk8h& gauss2,
    525         const Sk8h& gauss3,
    526         const Sk8h& gauss4,
    527         Sk8h* d0, Sk8h* d8) {
    528     auto v0 = s0.mulHi(gauss0);
    529     auto v1 = s0.mulHi(gauss1);
    530     auto v2 = s0.mulHi(gauss2);
    531     auto v3 = s0.mulHi(gauss3);
    532     auto v4 = s0.mulHi(gauss4);
    533 
    534     // D[n..n+7]  += S[n..n+7] * G[4]
    535     *d0 += v4;
    536 
    537     // D[n..n+8]  += {0, S[n..n+7] * G[3]}
    538     *d0 += Sk8h{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]};
    539     *d8 += Sk8h{v3[7], _____, _____, _____, _____, _____, _____, _____};
    540 
    541     // D[n..n+9]  += {0, 0, S[n..n+7] * G[2]}
    542     *d0 += Sk8h{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]};
    543     *d8 += Sk8h{v2[6], v2[7], _____, _____, _____, _____, _____, _____};
    544 
    545     // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[1]}
    546     *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
    547     *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
    548 
    549     // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[0]}
    550     *d0 += Sk8h{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]};
    551     *d8 += Sk8h{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____};
    552 
    553     // D[n..n+12]  += {0, 0, 0, 0, 0, S[n..n+7] * G[1]}
    554     *d0 += Sk8h{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]};
    555     *d8 += Sk8h{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____};
    556 
    557     // D[n..n+13]  += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]}
    558     *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v2[0], v2[1]};
    559     *d8 += Sk8h{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____};
    560 
    561     // D[n..n+14]  += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
    562     *d0 += Sk8h{_____, _____, _____, _____, _____, _____, _____, v3[0]};
    563     *d8 += Sk8h{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____};
    564 
    565     // D[n..n+15]  += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]}
    566     *d8 += v4;
    567 }
    568 
    569 using BlurX = decltype(blur_x_radius_1);
    570 
    571 // BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
    572 static SK_ALWAYS_INLINE void blur_row(
    573         BlurX blur,
    574         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
    575         const uint8_t* src, int srcW,
    576               uint8_t* dst, int dstW) {
    577     // Clear the buffer to handle summing wider than source.
    578     Sk8h d0{kHalf}, d8{kHalf};
    579 
    580     // Go by multiples of 8 in src.
    581     int x = 0;
    582     for (; x <= srcW - 8; x += 8) {
    583         blur(load(src, 8), g0, g1, g2, g3, g4, &d0, &d8);
    584 
    585         store(dst, d0, 8);
    586 
    587         d0 = d8;
    588         d8 = Sk8h{kHalf};
    589 
    590         src += 8;
    591         dst += 8;
    592     }
    593 
    594     // There are src values left, but the remainder of src values is not a multiple of 8.
    595     int srcTail = srcW - x;
    596     if (srcTail > 0) {
    597 
    598         blur(load(src, srcTail), g0, g1, g2, g3, g4, &d0, &d8);
    599 
    600         int dstTail = std::min(8, dstW - x);
    601         store(dst, d0, dstTail);
    602 
    603         d0 = d8;
    604         dst += dstTail;
    605         x += dstTail;
    606     }
    607 
    608     // There are dst mask values to complete.
    609     int dstTail = dstW - x;
    610     if (dstTail > 0) {
    611         store(dst, d0, dstTail);
    612     }
    613 }
    614 
    615 // BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
    616 static SK_ALWAYS_INLINE void blur_x_rect(
    617         BlurX blur,
    618         uint16_t* gauss,
    619         const uint8_t* src, size_t srcStride, int srcW,
    620               uint8_t* dst, size_t dstStride, int dstW, int dstH) {
    621 
    622     Sk8h g0{gauss[0]},
    623          g1{gauss[1]},
    624          g2{gauss[2]},
    625          g3{gauss[3]},
    626          g4{gauss[4]};
    627 
    628     // Blur *ALL* the rows.
    629     for (int y = 0; y < dstH; y++) {
    630         blur_row(blur, g0, g1, g2, g3, g4, src, srcW, dst, dstW);
    631         src += srcStride;
    632         dst += dstStride;
    633     }
    634 }
    635 
    636 SK_ATTRIBUTE(noinline) static void direct_blur_x(
    637     int radius, uint16_t* gauss,
    638     const uint8_t* src, size_t srcStride, int srcW,
    639           uint8_t* dst, size_t dstStride, int dstW, int dstH) {
    640 
    641     switch (radius) {
    642         case 1:
    643             blur_x_rect(blur_x_radius_1, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
    644             break;
    645 
    646         case 2:
    647             blur_x_rect(blur_x_radius_2, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
    648             break;
    649 
    650         case 3:
    651             blur_x_rect(blur_x_radius_3, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
    652             break;
    653 
    654         case 4:
    655             blur_x_rect(blur_x_radius_4, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
    656             break;
    657 
    658         default:
    659             SkASSERTF(false, "The radius %d is not handled\n", radius);
    660     }
    661 }
    662 
    663 // The operations of the blur_y_radius_N functions work on a theme similar to the blur_x_radius_N
    664 // functions, but end up being simpler because there is no complicated shift of registers. We
    665 // start with the non-traditional form of the gaussian filter. In the following r is the value
    666 // when added generates the next value in the column.
    667 //
    668 //   D[n+0r] = S[n+0r]*G[1]
    669 //           + S[n+1r]*G[0]
    670 //           + S[n+2r]*G[1]
    671 //
    672 // Expanding out in a way similar to blur_x_radius_N for specific values of n.
    673 //
    674 //   D[n+0r] = S[n-2r]*G[1] + S[n-1r]*G[0] + S[n+0r]*G[1]
    675 //   D[n+1r] = S[n-1r]*G[1] + S[n+0r]*G[0] + S[n+1r]*G[1]
    676 //   D[n+2r] = S[n+0r]*G[1] + S[n+1r]*G[0] + S[n+2r]*G[1]
    677 //
    678 // We can see that S[n+0r] is in all three D[] equations, but is only multiplied twice. Now we
    679 // can look at the calculation form the point of view of a source value.
    680 //
    681 //   Given S[n+0r]:
    682 //   D[n+0r] += S[n+0r]*G[1];
    683 //   /* D[n+0r] is done and can be stored now. */
    684 //   D[n+1r] += S[n+0r]*G[0];
    685 //   D[n+2r]  = S[n+0r]*G[1];
    686 //
    687 // Remember, by induction, that D[n+0r] == S[n-2r]*G[1] + S[n-1r]*G[0] before adding in
    688 // S[n+0r]*G[1]. So, after the addition D[n+0r] has finished calculation and can be stored. Also,
    689 // notice that D[n+2r] is receiving its first value from S[n+0r]*G[1] and is not added in. Notice
    690 // how values flow in the following two iterations in source.
    691 //
    692 //   D[n+0r] += S[n+0r]*G[1]
    693 //   D[n+1r] += S[n+0r]*G[0]
    694 //   D[n+2r]  = S[n+0r]*G[1]
    695 //   /* ------- */
    696 //   D[n+1r] += S[n+1r]*G[1]
    697 //   D[n+2r] += S[n+1r]*G[0]
    698 //   D[n+3r]  = S[n+1r]*G[1]
    699 //
    700 // Instead of using memory we can introduce temporaries d01 and d12. The update step changes
    701 // to the following.
    702 //
    703 //   answer = d01 + S[n+0r]*G[1]
    704 //   d01    = d12 + S[n+0r]*G[0]
    705 //   d12    =       S[n+0r]*G[1]
    706 //   return answer
    707 //
    708 // Finally, this can be ganged into SIMD style.
    709 //   answer[0..7] = d01[0..7] + S[n+0r..n+0r+7]*G[1]
    710 //   d01[0..7]    = d12[0..7] + S[n+0r..n+0r+7]*G[0]
    711 //   d12[0..7]    =             S[n+0r..n+0r+7]*G[1]
    712 //   return answer[0..7]
    713 static SK_ALWAYS_INLINE Sk8h blur_y_radius_1(
    714         const Sk8h& s0,
    715         const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
    716         Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
    717     auto v0 = s0.mulHi(g0);
    718     auto v1 = s0.mulHi(g1);
    719 
    720     Sk8h answer = *d01 + v1;
    721            *d01 = *d12 + v0;
    722            *d12 =        v1 + kHalf;
    723 
    724     return answer;
    725 }
    726 
    727 static SK_ALWAYS_INLINE Sk8h blur_y_radius_2(
    728         const Sk8h& s0,
    729         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
    730         Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
    731     auto v0 = s0.mulHi(g0);
    732     auto v1 = s0.mulHi(g1);
    733     auto v2 = s0.mulHi(g2);
    734 
    735     Sk8h answer = *d01 + v2;
    736            *d01 = *d12 + v1;
    737            *d12 = *d23 + v0;
    738            *d23 = *d34 + v1;
    739            *d34 =        v2 + kHalf;
    740 
    741     return answer;
    742 }
    743 
    744 static SK_ALWAYS_INLINE Sk8h blur_y_radius_3(
    745         const Sk8h& s0,
    746         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&,
    747         Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) {
    748     auto v0 = s0.mulHi(g0);
    749     auto v1 = s0.mulHi(g1);
    750     auto v2 = s0.mulHi(g2);
    751     auto v3 = s0.mulHi(g3);
    752 
    753     Sk8h answer = *d01 + v3;
    754            *d01 = *d12 + v2;
    755            *d12 = *d23 + v1;
    756            *d23 = *d34 + v0;
    757            *d34 = *d45 + v1;
    758            *d45 = *d56 + v2;
    759            *d56 =        v3 + kHalf;
    760 
    761     return answer;
    762 }
    763 
    764 static SK_ALWAYS_INLINE Sk8h blur_y_radius_4(
    765     const Sk8h& s0,
    766     const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
    767     Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) {
    768     auto v0 = s0.mulHi(g0);
    769     auto v1 = s0.mulHi(g1);
    770     auto v2 = s0.mulHi(g2);
    771     auto v3 = s0.mulHi(g3);
    772     auto v4 = s0.mulHi(g4);
    773 
    774     Sk8h answer = *d01 + v4;
    775            *d01 = *d12 + v3;
    776            *d12 = *d23 + v2;
    777            *d23 = *d34 + v1;
    778            *d34 = *d45 + v0;
    779            *d45 = *d56 + v1;
    780            *d56 = *d67 + v2;
    781            *d67 = *d78 + v3;
    782            *d78 =        v4 + kHalf;
    783 
    784     return answer;
    785 }
    786 
    787 using BlurY = decltype(blur_y_radius_1);
    788 
    789 // BlurY will be one of blur_y_radius_(1|2|3|4).
    790 static SK_ALWAYS_INLINE void blur_column(
    791         BlurY blur, int radius, int width,
    792         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
    793         const uint8_t* src, size_t srcStride, int srcH,
    794         uint8_t* dst, size_t dstStride) {
    795     Sk8h d01{kHalf}, d12{kHalf}, d23{kHalf}, d34{kHalf},
    796          d45{kHalf}, d56{kHalf}, d67{kHalf}, d78{kHalf};
    797 
    798     auto flush = [&](uint8_t* to, const Sk8h& v0, const Sk8h& v1) {
    799         store(to, v0, width);
    800         to += dstStride;
    801         store(to, v1, width);
    802         return to + dstStride;
    803     };
    804 
    805     for (int y = 0; y < srcH; y += 1) {
    806         auto s = load(src, width);
    807         auto b = blur(s,
    808                       g0, g1, g2, g3, g4,
    809                       &d01, &d12, &d23, &d34, &d45, &d56, &d67, &d78);
    810         store(dst, b, width);
    811         src += srcStride;
    812         dst += dstStride;
    813     }
    814 
    815     if (radius >= 1) {
    816         dst = flush(dst, d01, d12);
    817     }
    818     if (radius >= 2) {
    819         dst = flush(dst, d23, d34);
    820     }
    821     if (radius >= 3) {
    822         dst = flush(dst, d45, d56);
    823     }
    824     if (radius >= 4) {
    825               flush(dst, d67, d78);
    826     }
    827 }
    828 
    829 // BlurY will be one of blur_y_radius_(1|2|3|4).
    830 static SK_ALWAYS_INLINE void blur_y_rect(
    831         BlurY blur, int radius, uint16_t *gauss,
    832         const uint8_t *src, size_t srcStride, int srcW, int srcH,
    833         uint8_t *dst, size_t dstStride) {
    834 
    835     Sk8h g0{gauss[0]},
    836          g1{gauss[1]},
    837          g2{gauss[2]},
    838          g3{gauss[3]},
    839          g4{gauss[4]};
    840 
    841     int x = 0;
    842     for (; x <= srcW - 8; x += 8) {
    843         blur_column(blur, radius, 8,
    844                     g0, g1, g2, g3, g4,
    845                     src, srcStride, srcH,
    846                     dst, dstStride);
    847         src += 8;
    848         dst += 8;
    849     }
    850 
    851     int xTail = srcW - x;
    852     if (xTail > 0) {
    853         blur_column(blur, radius, xTail,
    854                     g0, g1, g2, g3, g4,
    855                     src, srcStride, srcH,
    856                     dst, dstStride);
    857     }
    858 }
    859 
    860 SK_ATTRIBUTE(noinline) static void direct_blur_y(
    861         int radius, uint16_t* gauss,
    862         const uint8_t* src, size_t srcStride, int srcW, int srcH,
    863               uint8_t* dst, size_t dstStride) {
    864 
    865     switch (radius) {
    866         case 1:
    867             blur_y_rect(blur_y_radius_1, 1, gauss,
    868                         src, srcStride, srcW, srcH,
    869                         dst, dstStride);
    870             break;
    871 
    872         case 2:
    873             blur_y_rect(blur_y_radius_2, 2, gauss,
    874                         src, srcStride, srcW, srcH,
    875                         dst, dstStride);
    876             break;
    877 
    878         case 3:
    879             blur_y_rect(blur_y_radius_3, 3, gauss,
    880                         src, srcStride, srcW, srcH,
    881                         dst, dstStride);
    882             break;
    883 
    884         case 4:
    885             blur_y_rect(blur_y_radius_4, 4, gauss,
    886                         src, srcStride, srcW, srcH,
    887                         dst, dstStride);
    888             break;
    889 
    890         default:
    891             SkASSERTF(false, "The radius %d is not handled\n", radius);
    892     }
    893 }
    894 
    895 static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMask* dst) {
    896     SkASSERT(0 <= sigmaX && sigmaX < 2);
    897     SkASSERT(0 <= sigmaY && sigmaY < 2);
    898 
    899     SkGaussFilter filterX{sigmaX, SkGaussFilter::Type::Bessel},
    900                   filterY{sigmaY, SkGaussFilter::Type::Bessel};
    901 
    902     int radiusX = filterX.radius(),
    903         radiusY = filterY.radius();
    904 
    905     SkASSERT(radiusX <= 4 && radiusY <= 4);
    906 
    907     auto prepareGauss = [](const SkGaussFilter& filter, uint16_t* factors) {
    908         int i = 0;
    909         for (double d : filter) {
    910             factors[i++] = static_cast<uint16_t>(round(d * (1 << 16)));
    911         }
    912     };
    913 
    914     uint16_t gaussFactorsX[SkGaussFilter::kGaussArrayMax],
    915              gaussFactorsY[SkGaussFilter::kGaussArrayMax];
    916 
    917     prepareGauss(filterX, gaussFactorsX);
    918     prepareGauss(filterY, gaussFactorsY);
    919 
    920     *dst = prepare_destination(radiusX, radiusY, src);
    921     if (src.fImage == nullptr) {
    922         return {SkTo<int32_t>(radiusX), SkTo<int32_t>(radiusY)};
    923     }
    924     if (dst->fImage == nullptr) {
    925         dst->fBounds.setEmpty();
    926         return {0, 0};
    927     }
    928 
    929     int srcW = src.fBounds.width(),
    930         srcH = src.fBounds.height();
    931 
    932     int dstW = dst->fBounds.width(),
    933         dstH = dst->fBounds.height();
    934 
    935     size_t srcStride = src.fRowBytes,
    936            dstStride = dst->fRowBytes;
    937 
    938     //TODO: handle bluring in only one direction.
    939 
    940     // Blur vertically and copy to destination.
    941     direct_blur_y(radiusY, gaussFactorsY,
    942                   src.fImage,  srcStride, srcW, srcH,
    943                   dst->fImage + radiusX, dstStride);
    944 
    945     // Blur horizontally in place.
    946     direct_blur_x(radiusX, gaussFactorsX,
    947                   dst->fImage + radiusX,  dstStride, srcW,
    948                   dst->fImage,            dstStride, dstW, dstH);
    949 
    950     return {radiusX, radiusY};
    951 }
    952 
    953 // TODO: assuming sigmaW = sigmaH. Allow different sigmas. Right now the
    954 // API forces the sigmas to be the same.
    955 SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const {
    956 
    957     if (fSigmaW < 2.0 && fSigmaH < 2.0) {
    958         return small_blur(fSigmaW, fSigmaH, src, dst);
    959     }
    960 
    961     // 1024 is a place holder guess until more analysis can be done.
    962     SkSTArenaAlloc<1024> alloc;
    963 
    964     PlanningInterface* planW = alloc.make<PlanGauss>(fSigmaW);
    965     PlanningInterface* planH = alloc.make<PlanGauss>(fSigmaH);
    966 
    967     int borderW = planW->border(),
    968         borderH = planH->border();
    969     SkASSERT(borderH >= 0 && borderW >= 0);
    970 
    971     *dst = prepare_destination(borderW, borderH, src);
    972     if (src.fImage == nullptr) {
    973         return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
    974     }
    975     if (dst->fImage == nullptr) {
    976         dst->fBounds.setEmpty();
    977         return {0, 0};
    978     }
    979 
    980     int srcW = src.fBounds.width(),
    981         srcH = src.fBounds.height(),
    982         dstW = dst->fBounds.width(),
    983         dstH = dst->fBounds.height();
    984     SkASSERT(srcW >= 0 && srcH >= 0 && dstW >= 0 && dstH >= 0);
    985 
    986     auto bufferSize = std::max(planW->bufferSize(), planH->bufferSize());
    987     auto buffer = alloc.makeArrayDefault<uint32_t>(bufferSize);
    988 
    989     if (planW->needsBlur() && planH->needsBlur()) {
    990         // Blur both directions.
    991         int tmpW = srcH,
    992             tmpH = dstW;
    993 
    994         auto tmp = alloc.makeArrayDefault<uint8_t>(tmpW * tmpH);
    995 
    996         // Blur horizontally, and transpose.
    997         auto scanW = planW->makeBlurScan(&alloc, srcW, buffer);
    998         for (int y = 0; y < srcH; y++) {
    999             auto srcStart = &src.fImage[y * src.fRowBytes];
   1000             auto tmpStart = &tmp[y];
   1001             scanW->blur(srcStart,    1, srcStart + srcW,
   1002                         tmpStart, tmpW, tmpStart + tmpW * tmpH);
   1003         }
   1004 
   1005         // Blur vertically (scan in memory order because of the transposition),
   1006         // and transpose back to the original orientation.
   1007         auto scanH = planH->makeBlurScan(&alloc, tmpW, buffer);
   1008         for (int y = 0; y < tmpH; y++) {
   1009             auto tmpStart = &tmp[y * tmpW];
   1010             auto dstStart = &dst->fImage[y];
   1011 
   1012             scanH->blur(tmpStart, 1, tmpStart + tmpW,
   1013                         dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH);
   1014         }
   1015     } else {
   1016         // Copy to dst. No Blur.
   1017         SkASSERT(false);    // should not get here
   1018         for (int y = 0; y < srcH; y++) {
   1019             std::memcpy(&dst->fImage[y * dst->fRowBytes], &src.fImage[y * src.fRowBytes], dstW);
   1020         }
   1021     }
   1022 
   1023     return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
   1024 }
   1025