Home | History | Annotate | Download | only in core
      1 /*
      2  * Copyright 2017 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include "SkMaskBlurFilter.h"
      9 
     10 #include "SkArenaAlloc.h"
     11 #include "SkColorPriv.h"
     12 #include "SkGaussFilter.h"
     13 #include "SkMalloc.h"
     14 #include "SkNx.h"
     15 #include "SkTemplates.h"
     16 #include "SkTo.h"
     17 
     18 #include <cmath>
     19 #include <climits>
     20 
     21 namespace {
     22 static const double kPi = 3.14159265358979323846264338327950288;
     23 
     24 class PlanGauss final {
     25 public:
     26     explicit PlanGauss(double sigma) {
     27         auto possibleWindow = static_cast<int>(floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5));
     28         auto window = std::max(1, possibleWindow);
     29 
     30         fPass0Size = window - 1;
     31         fPass1Size = window - 1;
     32         fPass2Size = (window & 1) == 1 ? window - 1 : window;
     33 
     34         // Calculating the border is tricky. I will go through the odd case which is simpler, and
     35         // then through the even case. Given a stack of filters seven wide for the odd case of
     36         // three passes.
     37         //
     38         //        S
     39         //     aaaAaaa
     40         //     bbbBbbb
     41         //     cccCccc
     42         //        D
     43         //
     44         // The furthest changed pixel is when the filters are in the following configuration.
     45         //
     46         //                 S
     47         //           aaaAaaa
     48         //        bbbBbbb
     49         //     cccCccc
     50         //        D
     51         //
     52         //  The A pixel is calculated using the value S, the B uses A, and the C uses B, and
     53         // finally D is C. So, with a window size of seven the border is nine. In general, the
     54         // border is 3*((window - 1)/2).
     55         //
     56         // For even cases the filter stack is more complicated. The spec specifies two passes
     57         // of even filters and a final pass of odd filters. A stack for a width of six looks like
     58         // this.
     59         //
     60         //       S
     61         //    aaaAaa
     62         //     bbBbbb
     63         //    cccCccc
     64         //       D
     65         //
     66         // The furthest pixel looks like this.
     67         //
     68         //               S
     69         //          aaaAaa
     70         //        bbBbbb
     71         //    cccCccc
     72         //       D
     73         //
     74         // For a window of size, the border value is seven. In general the border is 3 *
     75         // (window/2) -1.
     76         fBorder = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1;
     77         fSlidingWindow = 2 * fBorder + 1;
     78 
     79         // If the window is odd then the divisor is just window ^ 3 otherwise,
     80         // it is window * window * (window + 1) = window ^ 2 + window ^ 3;
     81         auto window2 = window * window;
     82         auto window3 = window2 * window;
     83         auto divisor = (window & 1) == 1 ? window3 : window3 + window2;
     84 
     85         fWeight = static_cast<uint64_t>(round(1.0 / divisor * (1ull << 32)));
     86     }
     87 
     88     size_t bufferSize() const { return fPass0Size + fPass1Size + fPass2Size; }
     89 
     90     int    border()     const { return fBorder; }
     91 
     92 public:
     93     class Scan {
     94     public:
     95         Scan(uint64_t weight, int noChangeCount,
     96              uint32_t* buffer0, uint32_t* buffer0End,
     97              uint32_t* buffer1, uint32_t* buffer1End,
     98              uint32_t* buffer2, uint32_t* buffer2End)
     99             : fWeight{weight}
    100             , fNoChangeCount{noChangeCount}
    101             , fBuffer0{buffer0}
    102             , fBuffer0End{buffer0End}
    103             , fBuffer1{buffer1}
    104             , fBuffer1End{buffer1End}
    105             , fBuffer2{buffer2}
    106             , fBuffer2End{buffer2End}
    107         { }
    108 
    109         template <typename AlphaIter> void blur(const AlphaIter srcBegin, const AlphaIter srcEnd,
    110                     uint8_t* dst, int dstStride, uint8_t* dstEnd) const {
    111             auto buffer0Cursor = fBuffer0;
    112             auto buffer1Cursor = fBuffer1;
    113             auto buffer2Cursor = fBuffer2;
    114 
    115             std::memset(fBuffer0, 0x00, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));
    116 
    117             uint32_t sum0 = 0;
    118             uint32_t sum1 = 0;
    119             uint32_t sum2 = 0;
    120 
    121             // Consume the source generating pixels.
    122             for (AlphaIter src = srcBegin; src < srcEnd; ++src, dst += dstStride) {
    123                 uint32_t leadingEdge = *src;
    124                 sum0 += leadingEdge;
    125                 sum1 += sum0;
    126                 sum2 += sum1;
    127 
    128                 *dst = this->finalScale(sum2);
    129 
    130                 sum2 -= *buffer2Cursor;
    131                 *buffer2Cursor = sum1;
    132                 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
    133 
    134                 sum1 -= *buffer1Cursor;
    135                 *buffer1Cursor = sum0;
    136                 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
    137 
    138                 sum0 -= *buffer0Cursor;
    139                 *buffer0Cursor = leadingEdge;
    140                 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
    141             }
    142 
    143             // The leading edge is off the right side of the mask.
    144             for (int i = 0; i < fNoChangeCount; i++) {
    145                 uint32_t leadingEdge = 0;
    146                 sum0 += leadingEdge;
    147                 sum1 += sum0;
    148                 sum2 += sum1;
    149 
    150                 *dst = this->finalScale(sum2);
    151 
    152                 sum2 -= *buffer2Cursor;
    153                 *buffer2Cursor = sum1;
    154                 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
    155 
    156                 sum1 -= *buffer1Cursor;
    157                 *buffer1Cursor = sum0;
    158                 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
    159 
    160                 sum0 -= *buffer0Cursor;
    161                 *buffer0Cursor = leadingEdge;
    162                 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
    163 
    164                 dst += dstStride;
    165             }
    166 
    167             // Starting from the right, fill in the rest of the buffer.
    168             std::memset(fBuffer0, 0, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));
    169 
    170             sum0 = sum1 = sum2 = 0;
    171 
    172             uint8_t* dstCursor = dstEnd;
    173             AlphaIter src = srcEnd;
    174             while (dstCursor > dst) {
    175                 dstCursor -= dstStride;
    176                 uint32_t leadingEdge = *(--src);
    177                 sum0 += leadingEdge;
    178                 sum1 += sum0;
    179                 sum2 += sum1;
    180 
    181                 *dstCursor = this->finalScale(sum2);
    182 
    183                 sum2 -= *buffer2Cursor;
    184                 *buffer2Cursor = sum1;
    185                 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
    186 
    187                 sum1 -= *buffer1Cursor;
    188                 *buffer1Cursor = sum0;
    189                 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
    190 
    191                 sum0 -= *buffer0Cursor;
    192                 *buffer0Cursor = leadingEdge;
    193                 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
    194             }
    195         }
    196 
    197     private:
    198         static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31;
    199 
    200         uint8_t finalScale(uint32_t sum) const {
    201             return SkTo<uint8_t>((fWeight * sum + kHalf) >> 32);
    202         }
    203 
    204         uint64_t  fWeight;
    205         int       fNoChangeCount;
    206         uint32_t* fBuffer0;
    207         uint32_t* fBuffer0End;
    208         uint32_t* fBuffer1;
    209         uint32_t* fBuffer1End;
    210         uint32_t* fBuffer2;
    211         uint32_t* fBuffer2End;
    212     };
    213 
    214     Scan makeBlurScan(int width, uint32_t* buffer) const {
    215         uint32_t* buffer0, *buffer0End, *buffer1, *buffer1End, *buffer2, *buffer2End;
    216         buffer0 = buffer;
    217         buffer0End = buffer1 = buffer0 + fPass0Size;
    218         buffer1End = buffer2 = buffer1 + fPass1Size;
    219         buffer2End = buffer2 + fPass2Size;
    220         int noChangeCount = fSlidingWindow > width ? fSlidingWindow - width : 0;
    221 
    222         return Scan(
    223             fWeight, noChangeCount,
    224             buffer0, buffer0End,
    225             buffer1, buffer1End,
    226             buffer2, buffer2End);
    227     }
    228 
    229     uint64_t fWeight;
    230     int      fBorder;
    231     int      fSlidingWindow;
    232     int      fPass0Size;
    233     int      fPass1Size;
    234     int      fPass2Size;
    235 };
    236 
    237 } // namespace
    238 
// NB 136 is the largest sigma that will not cause a buffer full of 255 mask values to overflow
// using the Gauss filter. It also limits the size of buffers used to hold intermediate values.
    241 // Explanation of maximums:
    242 //   sum0 = window * 255
    243 //   sum1 = window * sum0 -> window * window * 255
    244 //   sum2 = window * sum1 -> window * window * window * 255 -> window^3 * 255
    245 //
    246 //   The value window^3 * 255 must fit in a uint32_t. So,
    247 //      window^3 < 2^32. window = 255.
    248 //
    249 //   window = floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5)
    250 //   For window <= 255, the largest value for sigma is 136.
    251 SkMaskBlurFilter::SkMaskBlurFilter(double sigmaW, double sigmaH)
    252     : fSigmaW{SkTPin(sigmaW, 0.0, 136.0)}
    253     , fSigmaH{SkTPin(sigmaH, 0.0, 136.0)}
    254 {
    255     SkASSERT(sigmaW >= 0);
    256     SkASSERT(sigmaH >= 0);
    257 }
    258 
    259 bool SkMaskBlurFilter::hasNoBlur() const {
    260     return (3 * fSigmaW <= 1) && (3 * fSigmaH <= 1);
    261 }
    262 
    263 // We favor A8 masks, and if we need to work with another format, we'll convert to A8 first.
    264 // Each of these converts width (up to 8) mask values to A8.
    265 static void bw_to_a8(uint8_t* a8, const uint8_t* from, int width) {
    266     SkASSERT(0 < width && width <= 8);
    267 
    268     uint8_t masks = *from;
    269     for (int i = 0; i < width; ++i) {
    270         a8[i] = (masks >> (7 - i)) & 1 ? 0xFF
    271                                        : 0x00;
    272     }
    273 }
    274 static void lcd_to_a8(uint8_t* a8, const uint8_t* from, int width) {
    275     SkASSERT(0 < width && width <= 8);
    276 
    277     for (int i = 0; i < width; ++i) {
    278         unsigned rgb = reinterpret_cast<const uint16_t*>(from)[i],
    279                    r = SkPacked16ToR32(rgb),
    280                    g = SkPacked16ToG32(rgb),
    281                    b = SkPacked16ToB32(rgb);
    282         a8[i] = (r + g + b) / 3;
    283     }
    284 }
    285 static void argb32_to_a8(uint8_t* a8, const uint8_t* from, int width) {
    286     SkASSERT(0 < width && width <= 8);
    287     for (int i = 0; i < width; ++i) {
    288         uint32_t rgba = reinterpret_cast<const uint32_t*>(from)[i];
    289         a8[i] = SkGetPackedA32(rgba);
    290     }
    291 }
    292 using ToA8 = decltype(bw_to_a8);
    293 
    294 static Sk8h load(const uint8_t* from, int width, ToA8* toA8) {
    295     // Our fast path is a full 8-byte load of A8.
    296     // So we'll conditionally handle the two slow paths using tmp:
    297     //    - if we have a function to convert another mask to A8, use it;
    298     //    - if not but we have less than 8 bytes to load, load them one at a time.
    299     uint8_t tmp[8] = {0,0,0,0, 0,0,0,0};
    300     if (toA8) {
    301         toA8(tmp, from, width);
    302         from = tmp;
    303     } else if (width < 8) {
    304         for (int i = 0; i < width; ++i) {
    305             tmp[i] = from[i];
    306         }
    307         from = tmp;
    308     }
    309 
    310     // Load A8 and convert to 8.8 fixed-point.
    311     return SkNx_cast<uint16_t>(Sk8b::Load(from)) << 8;
    312 }
    313 
    314 static void store(uint8_t* to, const Sk8h& v, int width) {
    315     Sk8b b = SkNx_cast<uint8_t>(v >> 8);
    316     if (width == 8) {
    317         b.store(to);
    318     } else {
    319         uint8_t buffer[8];
    320         b.store(buffer);
    321         for (int i = 0; i < width; i++) {
    322             to[i] = buffer[i];
    323         }
    324     }
    325 };
    326 
    327 static constexpr uint16_t _____ = 0u;
    328 static constexpr uint16_t kHalf = 0x80u;
    329 
    330 // In all the blur_x_radius_N and blur_y_radius_N functions the gaussian values are encoded
// in 0.16 format, none of the values is greater than one. The incoming mask values are in 8.8
// format. The resulting multiply has an 8.24 format, but the mulhi truncates the lower 16 bits,
// resulting in an 8.8 format.
    334 //
// The blur_x_radius_N functions below blur along a row of pixels using a kernel with radius N.
// This system is set up to minimize the number of multiplies needed.
    337 //
    338 // Explanation:
    339 //    Blurring a specific mask value is given by the following equation where D_n is the resulting
    340 // mask value and S_n is the source value. The example below is for a filter with a radius of 1
    341 // and a width of 3 (radius == (width-1)/2). The indexes for the source and destination are
    342 // aligned. The filter is given by G_n where n is the symmetric filter value.
    343 //
    344 //   D[n] = S[n-1]*G[1] + S[n]*G[0] + S[n+1]*G[1].
    345 //
    346 // We can start the source index at an offset relative to the destination separated by the
    347 // radius. This results in a non-traditional restating of the above filter.
    348 //
    349 //  D[n] = S[n]*G[1] + S[n+1]*G[0] + S[n+2]*G[1]
    350 //
    351 // If we look at three specific consecutive destinations the following equations result:
    352 //
//   D[5] = S[5]*G[1] + S[6]*G[0] + S[7]*G[1]
//   D[6] = S[6]*G[1] + S[7]*G[0] + S[8]*G[1]
//   D[7] = S[7]*G[1] + S[8]*G[0] + S[9]*G[1].
    356 //
    357 // In the above equations, notice that S[7] is used in all three. In particular, two values are
    358 // used: S[7]*G[0] and S[7]*G[1]. So, S[7] is only multiplied twice, but used in D[5], D[6] and
    359 // D[7].
    360 //
    361 // From the point of view of a source value we end up with the following three equations.
    362 //
    363 // Given S[7]:
    364 //   D[5] += S[7]*G[1]
    365 //   D[6] += S[7]*G[0]
    366 //   D[7] += S[7]*G[1]
    367 //
    368 // In General:
    369 //   D[n]   += S[n]*G[1]
    370 //   D[n+1] += S[n]*G[0]
    371 //   D[n+2] += S[n]*G[1]
    372 //
    373 // Now these equations can be ganged using SIMD to form:
    374 //   D[n..n+7]   += S[n..n+7]*G[1]
    375 //   D[n+1..n+8] += S[n..n+7]*G[0]
    376 //   D[n+2..n+9] += S[n..n+7]*G[1]
    377 // The next set of values becomes.
    378 //   D[n+8..n+15]  += S[n+8..n+15]*G[1]
    379 //   D[n+9..n+16]  += S[n+8..n+15]*G[0]
    380 //   D[n+10..n+17] += S[n+8..n+15]*G[1]
// You can see that the D[n+8] and D[n+9] values overlap the two sets, using parts of both
// S[n..n+7] and S[n+8..n+15].
    383 //
    384 // Just one more transformation allows the code to maintain all working values in
    385 // registers. I introduce the notation {0, S[n..n+7] * G[k]} to mean that the value where 0 is
    386 // prepended to the array of values to form {0, S[n] * G[k], ..., S[n+7]*G[k]}.
    387 //
    388 //   D[n..n+7]  += S[n..n+7] * G[1]
    389 //   D[n..n+8]  += {0, S[n..n+7] * G[0]}
    390 //   D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
    391 //
// Now we can encode D[n..n+7] in a single Sk8h register called d0, and D[n+8..n+15] in a
// register d8. In addition, S[n..n+7] becomes s0.
//
// The {0, S[n..n+7] * G[k]} notation is translated into code in the following way.
    396 //
    397 // Sk8h v0 = s0*G[0]
    398 // Sk8h v1 = s0*G[1]
    399 // /* D[n..n+7]  += S[n..n+7] * G[1] */
    400 // d0 += v1;
    401 // /* D[n..n+8]  += {0, S[n..n+7] * G[0]} */
// d0 += {_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}
// d8 += {v0[7], _____, _____, _____, _____, _____, _____, _____}
// /* D[n..n+9]  += {0, 0, S[n..n+7] * G[1]} */
// d0 += {_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}
// d8 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____}
    407 // Where we rely on the compiler to generate efficient code for the {____, n, ....} notation.
    408 
    409 static void blur_x_radius_1(
    410         const Sk8h& s0,
    411         const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
    412         Sk8h* d0, Sk8h* d8) {
    413 
    414     auto v1 = s0.mulHi(g1);
    415     auto v0 = s0.mulHi(g0);
    416 
    417     // D[n..n+7]  += S[n..n+7] * G[1]
    418     *d0 += v1;
    419 
    420     //D[n..n+8]  += {0, S[n..n+7] * G[0]}
    421     *d0 += Sk8h{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]};
    422     *d8 += Sk8h{v0[7], _____, _____, _____, _____, _____, _____, _____};
    423 
    424     // D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
    425     *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
    426     *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
    427 
    428 }
    429 
    430 static void blur_x_radius_2(
    431         const Sk8h& s0,
    432         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
    433         Sk8h* d0, Sk8h* d8) {
    434     auto v0 = s0.mulHi(g0);
    435     auto v1 = s0.mulHi(g1);
    436     auto v2 = s0.mulHi(g2);
    437 
    438     // D[n..n+7]  += S[n..n+7] * G[2]
    439     *d0 += v2;
    440 
    441     // D[n..n+8]  += {0, S[n..n+7] * G[1]}
    442     *d0 += Sk8h{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]};
    443     *d8 += Sk8h{v1[7], _____, _____, _____, _____, _____, _____, _____};
    444 
    445     // D[n..n+9]  += {0, 0, S[n..n+7] * G[0]}
    446     *d0 += Sk8h{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]};
    447     *d8 += Sk8h{v0[6], v0[7], _____, _____, _____, _____, _____, _____};
    448 
    449     // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[1]}
    450     *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
    451     *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
    452 
    453     // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[2]}
    454     *d0 += Sk8h{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]};
    455     *d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____};
    456 }
    457 
    458 static void blur_x_radius_3(
    459         const Sk8h& s0,
    460         const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&,
    461         Sk8h* d0, Sk8h* d8) {
    462     auto v0 = s0.mulHi(gauss0);
    463     auto v1 = s0.mulHi(gauss1);
    464     auto v2 = s0.mulHi(gauss2);
    465     auto v3 = s0.mulHi(gauss3);
    466 
    467     // D[n..n+7]  += S[n..n+7] * G[3]
    468     *d0 += v3;
    469 
    470     // D[n..n+8]  += {0, S[n..n+7] * G[2]}
    471     *d0 += Sk8h{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]};
    472     *d8 += Sk8h{v2[7], _____, _____, _____, _____, _____, _____, _____};
    473 
    474     // D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
    475     *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
    476     *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
    477 
    478     // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[0]}
    479     *d0 += Sk8h{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]};
    480     *d8 += Sk8h{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____};
    481 
    482     // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[1]}
    483     *d0 += Sk8h{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]};
    484     *d8 += Sk8h{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____};
    485 
    486     // D[n..n+12]  += {0, 0, 0, 0, 0, S[n..n+7] * G[2]}
    487     *d0 += Sk8h{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]};
    488     *d8 += Sk8h{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____};
    489 
    490     // D[n..n+13]  += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
    491     *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v3[0], v3[1]};
    492     *d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____};
    493 }
    494 
    495 static void blur_x_radius_4(
    496         const Sk8h& s0,
    497         const Sk8h& gauss0,
    498         const Sk8h& gauss1,
    499         const Sk8h& gauss2,
    500         const Sk8h& gauss3,
    501         const Sk8h& gauss4,
    502         Sk8h* d0, Sk8h* d8) {
    503     auto v0 = s0.mulHi(gauss0);
    504     auto v1 = s0.mulHi(gauss1);
    505     auto v2 = s0.mulHi(gauss2);
    506     auto v3 = s0.mulHi(gauss3);
    507     auto v4 = s0.mulHi(gauss4);
    508 
    509     // D[n..n+7]  += S[n..n+7] * G[4]
    510     *d0 += v4;
    511 
    512     // D[n..n+8]  += {0, S[n..n+7] * G[3]}
    513     *d0 += Sk8h{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]};
    514     *d8 += Sk8h{v3[7], _____, _____, _____, _____, _____, _____, _____};
    515 
    516     // D[n..n+9]  += {0, 0, S[n..n+7] * G[2]}
    517     *d0 += Sk8h{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]};
    518     *d8 += Sk8h{v2[6], v2[7], _____, _____, _____, _____, _____, _____};
    519 
    520     // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[1]}
    521     *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
    522     *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
    523 
    524     // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[0]}
    525     *d0 += Sk8h{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]};
    526     *d8 += Sk8h{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____};
    527 
    528     // D[n..n+12]  += {0, 0, 0, 0, 0, S[n..n+7] * G[1]}
    529     *d0 += Sk8h{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]};
    530     *d8 += Sk8h{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____};
    531 
    532     // D[n..n+13]  += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]}
    533     *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v2[0], v2[1]};
    534     *d8 += Sk8h{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____};
    535 
    536     // D[n..n+14]  += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
    537     *d0 += Sk8h{_____, _____, _____, _____, _____, _____, _____, v3[0]};
    538     *d8 += Sk8h{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____};
    539 
    540     // D[n..n+15]  += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]}
    541     *d8 += v4;
    542 }
    543 
    544 using BlurX = decltype(blur_x_radius_1);
    545 
    546 // BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
    547 static void blur_row(
    548         BlurX blur,
    549         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
    550         const uint8_t* src, int srcW,
    551               uint8_t* dst, int dstW) {
    552     // Clear the buffer to handle summing wider than source.
    553     Sk8h d0{kHalf}, d8{kHalf};
    554 
    555     // Go by multiples of 8 in src.
    556     int x = 0;
    557     for (; x <= srcW - 8; x += 8) {
    558         blur(load(src, 8, nullptr), g0, g1, g2, g3, g4, &d0, &d8);
    559 
    560         store(dst, d0, 8);
    561 
    562         d0 = d8;
    563         d8 = Sk8h{kHalf};
    564 
    565         src += 8;
    566         dst += 8;
    567     }
    568 
    569     // There are src values left, but the remainder of src values is not a multiple of 8.
    570     int srcTail = srcW - x;
    571     if (srcTail > 0) {
    572 
    573         blur(load(src, srcTail, nullptr), g0, g1, g2, g3, g4, &d0, &d8);
    574 
    575         int dstTail = std::min(8, dstW - x);
    576         store(dst, d0, dstTail);
    577 
    578         d0 = d8;
    579         dst += dstTail;
    580         x += dstTail;
    581     }
    582 
    583     // There are dst mask values to complete.
    584     int dstTail = dstW - x;
    585     if (dstTail > 0) {
    586         store(dst, d0, dstTail);
    587     }
    588 }
    589 
    590 // BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
    591 static void blur_x_rect(BlurX blur,
    592                         uint16_t* gauss,
    593                         const uint8_t* src, size_t srcStride, int srcW,
    594                         uint8_t* dst, size_t dstStride, int dstW, int dstH) {
    595 
    596     Sk8h g0{gauss[0]},
    597          g1{gauss[1]},
    598          g2{gauss[2]},
    599          g3{gauss[3]},
    600          g4{gauss[4]};
    601 
    602     // Blur *ALL* the rows.
    603     for (int y = 0; y < dstH; y++) {
    604         blur_row(blur, g0, g1, g2, g3, g4, src, srcW, dst, dstW);
    605         src += srcStride;
    606         dst += dstStride;
    607     }
    608 }
    609 
    610 static void direct_blur_x(int radius, uint16_t* gauss,
    611                           const uint8_t* src, size_t srcStride, int srcW,
    612                           uint8_t* dst, size_t dstStride, int dstW, int dstH) {
    613 
    614     switch (radius) {
    615         case 1:
    616             blur_x_rect(blur_x_radius_1, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
    617             break;
    618 
    619         case 2:
    620             blur_x_rect(blur_x_radius_2, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
    621             break;
    622 
    623         case 3:
    624             blur_x_rect(blur_x_radius_3, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
    625             break;
    626 
    627         case 4:
    628             blur_x_rect(blur_x_radius_4, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
    629             break;
    630 
    631         default:
    632             SkASSERTF(false, "The radius %d is not handled\n", radius);
    633     }
    634 }
    635 
    636 // The operations of the blur_y_radius_N functions work on a theme similar to the blur_x_radius_N
    637 // functions, but end up being simpler because there is no complicated shift of registers. We
// start with the non-traditional form of the gaussian filter. In the following, r is the value
// that, when added, generates the next value in the column.
    640 //
    641 //   D[n+0r] = S[n+0r]*G[1]
    642 //           + S[n+1r]*G[0]
    643 //           + S[n+2r]*G[1]
    644 //
    645 // Expanding out in a way similar to blur_x_radius_N for specific values of n.
    646 //
    647 //   D[n+0r] = S[n-2r]*G[1] + S[n-1r]*G[0] + S[n+0r]*G[1]
    648 //   D[n+1r] = S[n-1r]*G[1] + S[n+0r]*G[0] + S[n+1r]*G[1]
    649 //   D[n+2r] = S[n+0r]*G[1] + S[n+1r]*G[0] + S[n+2r]*G[1]
    650 //
// We can see that S[n+0r] is in all three D[] equations, but is only multiplied twice. Now we
// can look at the calculation from the point of view of a source value.
    653 //
    654 //   Given S[n+0r]:
    655 //   D[n+0r] += S[n+0r]*G[1];
    656 //   /* D[n+0r] is done and can be stored now. */
    657 //   D[n+1r] += S[n+0r]*G[0];
    658 //   D[n+2r]  = S[n+0r]*G[1];
    659 //
    660 // Remember, by induction, that D[n+0r] == S[n-2r]*G[1] + S[n-1r]*G[0] before adding in
    661 // S[n+0r]*G[1]. So, after the addition D[n+0r] has finished calculation and can be stored. Also,
    662 // notice that D[n+2r] is receiving its first value from S[n+0r]*G[1] and is not added in. Notice
    663 // how values flow in the following two iterations in source.
    664 //
    665 //   D[n+0r] += S[n+0r]*G[1]
    666 //   D[n+1r] += S[n+0r]*G[0]
    667 //   D[n+2r]  = S[n+0r]*G[1]
    668 //   /* ------- */
    669 //   D[n+1r] += S[n+1r]*G[1]
    670 //   D[n+2r] += S[n+1r]*G[0]
    671 //   D[n+3r]  = S[n+1r]*G[1]
    672 //
    673 // Instead of using memory we can introduce temporaries d01 and d12. The update step changes
    674 // to the following.
    675 //
    676 //   answer = d01 + S[n+0r]*G[1]
    677 //   d01    = d12 + S[n+0r]*G[0]
    678 //   d12    =       S[n+0r]*G[1]
    679 //   return answer
    680 //
    681 // Finally, this can be ganged into SIMD style.
    682 //   answer[0..7] = d01[0..7] + S[n+0r..n+0r+7]*G[1]
    683 //   d01[0..7]    = d12[0..7] + S[n+0r..n+0r+7]*G[0]
    684 //   d12[0..7]    =             S[n+0r..n+0r+7]*G[1]
    685 //   return answer[0..7]
    686 static Sk8h blur_y_radius_1(
    687         const Sk8h& s0,
    688         const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
    689         Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
    690     auto v0 = s0.mulHi(g0);
    691     auto v1 = s0.mulHi(g1);
    692 
    693     Sk8h answer = *d01 + v1;
    694            *d01 = *d12 + v0;
    695            *d12 =        v1 + kHalf;
    696 
    697     return answer;
    698 }
    699 
    700 static Sk8h blur_y_radius_2(
    701         const Sk8h& s0,
    702         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
    703         Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
    704     auto v0 = s0.mulHi(g0);
    705     auto v1 = s0.mulHi(g1);
    706     auto v2 = s0.mulHi(g2);
    707 
    708     Sk8h answer = *d01 + v2;
    709            *d01 = *d12 + v1;
    710            *d12 = *d23 + v0;
    711            *d23 = *d34 + v1;
    712            *d34 =        v2 + kHalf;
    713 
    714     return answer;
    715 }
    716 
    717 static Sk8h blur_y_radius_3(
    718         const Sk8h& s0,
    719         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&,
    720         Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) {
    721     auto v0 = s0.mulHi(g0);
    722     auto v1 = s0.mulHi(g1);
    723     auto v2 = s0.mulHi(g2);
    724     auto v3 = s0.mulHi(g3);
    725 
    726     Sk8h answer = *d01 + v3;
    727            *d01 = *d12 + v2;
    728            *d12 = *d23 + v1;
    729            *d23 = *d34 + v0;
    730            *d34 = *d45 + v1;
    731            *d45 = *d56 + v2;
    732            *d56 =        v3 + kHalf;
    733 
    734     return answer;
    735 }
    736 
    737 static Sk8h blur_y_radius_4(
    738     const Sk8h& s0,
    739     const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
    740     Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) {
    741     auto v0 = s0.mulHi(g0);
    742     auto v1 = s0.mulHi(g1);
    743     auto v2 = s0.mulHi(g2);
    744     auto v3 = s0.mulHi(g3);
    745     auto v4 = s0.mulHi(g4);
    746 
    747     Sk8h answer = *d01 + v4;
    748            *d01 = *d12 + v3;
    749            *d12 = *d23 + v2;
    750            *d23 = *d34 + v1;
    751            *d34 = *d45 + v0;
    752            *d45 = *d56 + v1;
    753            *d56 = *d67 + v2;
    754            *d67 = *d78 + v3;
    755            *d78 =        v4 + kHalf;
    756 
    757     return answer;
    758 }
    759 
    760 using BlurY = decltype(blur_y_radius_1);
    761 
    762 // BlurY will be one of blur_y_radius_(1|2|3|4).
    763 static void blur_column(
    764         ToA8 toA8,
    765         BlurY blur, int radius, int width,
    766         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
    767         const uint8_t* src, size_t srcRB, int srcH,
    768         uint8_t* dst, size_t dstRB) {
    769     Sk8h d01{kHalf}, d12{kHalf}, d23{kHalf}, d34{kHalf},
    770          d45{kHalf}, d56{kHalf}, d67{kHalf}, d78{kHalf};
    771 
    772     auto flush = [&](uint8_t* to, const Sk8h& v0, const Sk8h& v1) {
    773         store(to, v0, width);
    774         to += dstRB;
    775         store(to, v1, width);
    776         return to + dstRB;
    777     };
    778 
    779     for (int y = 0; y < srcH; y += 1) {
    780         auto s = load(src, width, toA8);
    781         auto b = blur(s,
    782                       g0, g1, g2, g3, g4,
    783                       &d01, &d12, &d23, &d34, &d45, &d56, &d67, &d78);
    784         store(dst, b, width);
    785         src += srcRB;
    786         dst += dstRB;
    787     }
    788 
    789     if (radius >= 1) {
    790         dst = flush(dst, d01, d12);
    791     }
    792     if (radius >= 2) {
    793         dst = flush(dst, d23, d34);
    794     }
    795     if (radius >= 3) {
    796         dst = flush(dst, d45, d56);
    797     }
    798     if (radius >= 4) {
    799               flush(dst, d67, d78);
    800     }
    801 }
    802 
    803 // BlurY will be one of blur_y_radius_(1|2|3|4).
    804 static void blur_y_rect(ToA8 toA8, const int strideOf8,
    805                         BlurY blur, int radius, uint16_t *gauss,
    806                         const uint8_t *src, size_t srcRB, int srcW, int srcH,
    807                         uint8_t *dst, size_t dstRB) {
    808 
    809     Sk8h g0{gauss[0]},
    810          g1{gauss[1]},
    811          g2{gauss[2]},
    812          g3{gauss[3]},
    813          g4{gauss[4]};
    814 
    815     int x = 0;
    816     for (; x <= srcW - 8; x += 8) {
    817         blur_column(toA8, blur, radius, 8,
    818                     g0, g1, g2, g3, g4,
    819                     src, srcRB, srcH,
    820                     dst, dstRB);
    821         src += strideOf8;
    822         dst += 8;
    823     }
    824 
    825     int xTail = srcW - x;
    826     if (xTail > 0) {
    827         blur_column(toA8, blur, radius, xTail,
    828                     g0, g1, g2, g3, g4,
    829                     src, srcRB, srcH,
    830                     dst, dstRB);
    831     }
    832 }
    833 
    834 static void direct_blur_y(ToA8 toA8, const int strideOf8,
    835                           int radius, uint16_t* gauss,
    836                           const uint8_t* src, size_t srcRB, int srcW, int srcH,
    837                           uint8_t* dst, size_t dstRB) {
    838 
    839     switch (radius) {
    840         case 1:
    841             blur_y_rect(toA8, strideOf8, blur_y_radius_1, 1, gauss,
    842                         src, srcRB, srcW, srcH,
    843                         dst, dstRB);
    844             break;
    845 
    846         case 2:
    847             blur_y_rect(toA8, strideOf8, blur_y_radius_2, 2, gauss,
    848                         src, srcRB, srcW, srcH,
    849                         dst, dstRB);
    850             break;
    851 
    852         case 3:
    853             blur_y_rect(toA8, strideOf8, blur_y_radius_3, 3, gauss,
    854                         src, srcRB, srcW, srcH,
    855                         dst, dstRB);
    856             break;
    857 
    858         case 4:
    859             blur_y_rect(toA8, strideOf8, blur_y_radius_4, 4, gauss,
    860                         src, srcRB, srcW, srcH,
    861                         dst, dstRB);
    862             break;
    863 
    864         default:
    865             SkASSERTF(false, "The radius %d is not handled\n", radius);
    866     }
    867 }
    868 
    869 static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMask* dst) {
    870     SkASSERT(sigmaX == sigmaY); // TODO
    871     SkASSERT(0.01 <= sigmaX && sigmaX < 2);
    872     SkASSERT(0.01 <= sigmaY && sigmaY < 2);
    873 
    874     SkGaussFilter filterX{sigmaX},
    875                   filterY{sigmaY};
    876 
    877     int radiusX = filterX.radius(),
    878         radiusY = filterY.radius();
    879 
    880     SkASSERT(radiusX <= 4 && radiusY <= 4);
    881 
    882     auto prepareGauss = [](const SkGaussFilter& filter, uint16_t* factors) {
    883         int i = 0;
    884         for (double d : filter) {
    885             factors[i++] = static_cast<uint16_t>(round(d * (1 << 16)));
    886         }
    887     };
    888 
    889     uint16_t gaussFactorsX[SkGaussFilter::kGaussArrayMax],
    890              gaussFactorsY[SkGaussFilter::kGaussArrayMax];
    891 
    892     prepareGauss(filterX, gaussFactorsX);
    893     prepareGauss(filterY, gaussFactorsY);
    894 
    895     *dst = SkMask::PrepareDestination(radiusX, radiusY, src);
    896     if (src.fImage == nullptr) {
    897         return {SkTo<int32_t>(radiusX), SkTo<int32_t>(radiusY)};
    898     }
    899     if (dst->fImage == nullptr) {
    900         dst->fBounds.setEmpty();
    901         return {0, 0};
    902     }
    903 
    904     int srcW = src.fBounds.width(),
    905         srcH = src.fBounds.height();
    906 
    907     int dstW = dst->fBounds.width(),
    908         dstH = dst->fBounds.height();
    909 
    910     size_t srcRB = src.fRowBytes,
    911            dstRB = dst->fRowBytes;
    912 
    913     //TODO: handle bluring in only one direction.
    914 
    915     // Blur vertically and copy to destination.
    916     switch (src.fFormat) {
    917         case SkMask::kBW_Format:
    918             direct_blur_y(bw_to_a8, 1,
    919                           radiusY, gaussFactorsY,
    920                           src.fImage, srcRB, srcW, srcH,
    921                           dst->fImage + radiusX, dstRB);
    922             break;
    923         case SkMask::kA8_Format:
    924             direct_blur_y(nullptr, 8,
    925                           radiusY, gaussFactorsY,
    926                           src.fImage, srcRB, srcW, srcH,
    927                           dst->fImage + radiusX, dstRB);
    928             break;
    929         case SkMask::kARGB32_Format:
    930             direct_blur_y(argb32_to_a8, 32,
    931                           radiusY, gaussFactorsY,
    932                           src.fImage, srcRB, srcW, srcH,
    933                           dst->fImage + radiusX, dstRB);
    934             break;
    935         case SkMask::kLCD16_Format:
    936             direct_blur_y(lcd_to_a8, 16, radiusY, gaussFactorsY,
    937                           src.fImage, srcRB, srcW, srcH,
    938                           dst->fImage + radiusX, dstRB);
    939             break;
    940         default:
    941             SK_ABORT("Unhandled format.");
    942     }
    943 
    944     // Blur horizontally in place.
    945     direct_blur_x(radiusX, gaussFactorsX,
    946                   dst->fImage + radiusX,  dstRB, srcW,
    947                   dst->fImage,            dstRB, dstW, dstH);
    948 
    949     return {radiusX, radiusY};
    950 }
    951 
    952 // TODO: assuming sigmaW = sigmaH. Allow different sigmas. Right now the
    953 // API forces the sigmas to be the same.
    954 SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const {
    955 
    956     if (fSigmaW < 2.0 && fSigmaH < 2.0) {
    957         return small_blur(fSigmaW, fSigmaH, src, dst);
    958     }
    959 
    960     // 1024 is a place holder guess until more analysis can be done.
    961     SkSTArenaAlloc<1024> alloc;
    962 
    963     PlanGauss planW(fSigmaW);
    964     PlanGauss planH(fSigmaH);
    965 
    966     int borderW = planW.border(),
    967         borderH = planH.border();
    968     SkASSERT(borderH >= 0 && borderW >= 0);
    969 
    970     *dst = SkMask::PrepareDestination(borderW, borderH, src);
    971     if (src.fImage == nullptr) {
    972         return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
    973     }
    974     if (dst->fImage == nullptr) {
    975         dst->fBounds.setEmpty();
    976         return {0, 0};
    977     }
    978 
    979     int srcW = src.fBounds.width(),
    980         srcH = src.fBounds.height(),
    981         dstW = dst->fBounds.width(),
    982         dstH = dst->fBounds.height();
    983     SkASSERT(srcW >= 0 && srcH >= 0 && dstW >= 0 && dstH >= 0);
    984 
    985     auto bufferSize = std::max(planW.bufferSize(), planH.bufferSize());
    986     auto buffer = alloc.makeArrayDefault<uint32_t>(bufferSize);
    987 
    988     // Blur both directions.
    989     int tmpW = srcH,
    990         tmpH = dstW;
    991 
    992     auto tmp = alloc.makeArrayDefault<uint8_t>(tmpW * tmpH);
    993 
    994     // Blur horizontally, and transpose.
    995     const PlanGauss::Scan& scanW = planW.makeBlurScan(srcW, buffer);
    996     switch (src.fFormat) {
    997         case SkMask::kBW_Format: {
    998             const uint8_t* bwStart = src.fImage;
    999             auto start = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart, 0);
   1000             auto end = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart + (srcW / 8), srcW % 8);
   1001             for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
   1002                 auto tmpStart = &tmp[y];
   1003                 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
   1004             }
   1005         } break;
   1006         case SkMask::kA8_Format: {
   1007             const uint8_t* a8Start = src.fImage;
   1008             auto start = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start);
   1009             auto end = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start + srcW);
   1010             for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
   1011                 auto tmpStart = &tmp[y];
   1012                 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
   1013             }
   1014         } break;
   1015         case SkMask::kARGB32_Format: {
   1016             const uint32_t* argbStart = reinterpret_cast<const uint32_t*>(src.fImage);
   1017             auto start = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart);
   1018             auto end = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart + srcW);
   1019             for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
   1020                 auto tmpStart = &tmp[y];
   1021                 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
   1022             }
   1023         } break;
   1024         case SkMask::kLCD16_Format: {
   1025             const uint16_t* lcdStart = reinterpret_cast<const uint16_t*>(src.fImage);
   1026             auto start = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart);
   1027             auto end = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart + srcW);
   1028             for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
   1029                 auto tmpStart = &tmp[y];
   1030                 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
   1031             }
   1032         } break;
   1033         default:
   1034             SK_ABORT("Unhandled format.");
   1035     }
   1036 
   1037     // Blur vertically (scan in memory order because of the transposition),
   1038     // and transpose back to the original orientation.
   1039     const PlanGauss::Scan& scanH = planH.makeBlurScan(tmpW, buffer);
   1040     for (int y = 0; y < tmpH; y++) {
   1041         auto tmpStart = &tmp[y * tmpW];
   1042         auto dstStart = &dst->fImage[y];
   1043 
   1044         scanH.blur(tmpStart, tmpStart + tmpW,
   1045                    dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH);
   1046     }
   1047 
   1048     return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
   1049 }
   1050