Home | History | Annotate | Download | only in effects
      1 
      2 /*
      3  * Copyright 2006 The Android Open Source Project
      4  *
      5  * Use of this source code is governed by a BSD-style license that can be
      6  * found in the LICENSE file.
      7  */
      8 
      9 
     10 #include "SkBlurMask.h"
     11 #include "SkMath.h"
     12 #include "SkTemplates.h"
     13 #include "SkEndian.h"
     14 
     15 // Unrolling the integer blur kernel seems to give us a ~15% speedup on Windows,
     16 // breakeven on Mac, and ~15% slowdown on Linux.
// Reading a word at a time when building the sum buffer seems to give
     18 // us no appreciable speedup on Windows or Mac, and 2% slowdown on Linux.
     19 #if defined(SK_BUILD_FOR_WIN32)
     20 #define UNROLL_KERNEL_LOOP 1
     21 #endif
     22 
     23 /** The sum buffer is an array of u32 to hold the accumulated sum of all of the
     24     src values at their position, plus all values above and to the left.
     25     When we sample into this buffer, we need an initial row and column of 0s,
     26     so we have an index correspondence as follows:
     27 
     28     src[i, j] == sum[i+1, j+1]
     29     sum[0, j] == sum[i, 0] == 0
     30 
     31     We assume that the sum buffer's stride == its width
     32  */
static void build_sum_buffer(uint32_t sum[], int srcW, int srcH,
                             const uint8_t src[], int srcRB) {
    // The sum buffer has one extra column (and row) of zeros on the left/top.
    int sumW = srcW + 1;

    SkASSERT(srcRB >= srcW);
    // mod srcRB so we can apply it after each row
    srcRB -= srcW;

    int x, y;

    // zero out the top row and column
    memset(sum, 0, sumW * sizeof(sum[0]));
    sum += sumW;

    // special case first row: with no row above, the recurrence degenerates
    // to a plain running horizontal sum.
    uint32_t X = 0;
    *sum++ = 0; // initialize the first column to 0
    for (x = srcW - 1; x >= 0; --x) {
        X = *src++ + X;
        *sum++ = X;
    }
    src += srcRB;

    // now do the rest of the rows
    // Recurrence: sum[i,j] = src[i-1,j-1] + left + above - above-left.
    // L, C and T track the left, above-left and above entries so we only
    // read one earlier row entry (sum[-sumW]) per pixel.
    for (y = srcH - 1; y > 0; --y) {
        uint32_t L = 0;     // sum value immediately to our left
        uint32_t C = 0;     // sum value above-left (previous T)
        *sum++ = 0; // initialize the first column to 0

        // Process single bytes until src is word-aligned (the alignment
        // only matters for the unrolled loop below).
        for (x = srcW - 1; !SkIsAlign4((intptr_t) src) && x >= 0; x--) {
            uint32_t T = sum[-sumW];
            X = *src++ + L + T - C;
            *sum++ = X;
            L = X;
            C = T;
        }

        // Main loop, unrolled 4x per iteration.
        for (; x >= 4; x-=4) {
            uint32_t T = sum[-sumW];
            X = *src++ + L + T - C;
            *sum++ = X;
            L = X;
            C = T;
            T = sum[-sumW];
            X = *src++ + L + T - C;
            *sum++ = X;
            L = X;
            C = T;
            T = sum[-sumW];
            X = *src++ + L + T - C;
            *sum++ = X;
            L = X;
            C = T;
            T = sum[-sumW];
            X = *src++ + L + T - C;
            *sum++ = X;
            L = X;
            C = T;
        }

        // Tail: finish any remaining pixels in the row.
        for (; x >= 0; --x) {
            uint32_t T = sum[-sumW];
            X = *src++ + L + T - C;
            *sum++ = X;
            L = X;
            C = T;
        }
        src += srcRB;
    }
}
    103 
    104 /**
    105  * This is the path for apply_kernel() to be taken when the kernel
    106  * is wider than the source image.
    107  */
    108 static void kernel_clamped(uint8_t dst[], int rx, int ry, const uint32_t sum[],
    109                            int sw, int sh) {
    110     SkASSERT(2*rx > sw);
    111 
    112     uint32_t scale = (1 << 24) / ((2*rx + 1)*(2*ry + 1));
    113 
    114     int sumStride = sw + 1;
    115 
    116     int dw = sw + 2*rx;
    117     int dh = sh + 2*ry;
    118 
    119     int prev_y = -2*ry;
    120     int next_y = 1;
    121 
    122     for (int y = 0; y < dh; y++) {
    123         int py = SkClampPos(prev_y) * sumStride;
    124         int ny = SkFastMin32(next_y, sh) * sumStride;
    125 
    126         int prev_x = -2*rx;
    127         int next_x = 1;
    128 
    129         for (int x = 0; x < dw; x++) {
    130             int px = SkClampPos(prev_x);
    131             int nx = SkFastMin32(next_x, sw);
    132 
    133             uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
    134             *dst++ = SkToU8(tmp * scale >> 24);
    135 
    136             prev_x += 1;
    137             next_x += 1;
    138         }
    139 
    140         prev_y += 1;
    141         next_y += 1;
    142     }
    143 }
    144 /**
    145  *  sw and sh are the width and height of the src. Since the sum buffer
    146  *  matches that, but has an extra row and col at the beginning (with zeros),
    147  *  we can just use sw and sh as our "max" values for pinning coordinates
    148  *  when sampling into sum[][]
    149  *
    150  *  The inner loop is conceptually simple; we break it into several sections
    151  *  to improve performance. Here's the original version:
    152         for (int x = 0; x < dw; x++) {
    153             int px = SkClampPos(prev_x);
    154             int nx = SkFastMin32(next_x, sw);
    155 
    156             uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
    157             *dst++ = SkToU8(tmp * scale >> 24);
    158 
    159             prev_x += 1;
    160             next_x += 1;
    161         }
    162  *  The sections are:
    163  *     left-hand section, where prev_x is clamped to 0
    164  *     center section, where neither prev_x nor next_x is clamped
    165  *     right-hand section, where next_x is clamped to sw
    166  *  On some operating systems, the center section is unrolled for additional
    167  *  speedup.
    168 */
static void apply_kernel(uint8_t dst[], int rx, int ry, const uint32_t sum[],
                         int sw, int sh) {
    // If the kernel is wider than the source, every sample clamps; punt to
    // the simple fully-clamped implementation.
    if (2*rx > sw) {
        kernel_clamped(dst, rx, ry, sum, sw, sh);
        return;
    }

    // 1 / (box area), in fixed point with a 24-bit fraction
    uint32_t scale = (1 << 24) / ((2*rx + 1)*(2*ry + 1));

    int sumStride = sw + 1;

    // destination is the source inflated by the radius on every side
    int dw = sw + 2*rx;
    int dh = sh + 2*ry;

    int prev_y = -2*ry;
    int next_y = 1;

    // guarantees the left and center x-sections don't overlap
    SkASSERT(2*rx <= dw - 2*rx);

    for (int y = 0; y < dh; y++) {
        // top/bottom rows of the sampling box, clamped into the sum buffer
        int py = SkClampPos(prev_y) * sumStride;
        int ny = SkFastMin32(next_y, sh) * sumStride;

        int prev_x = -2*rx;
        int next_x = 1;
        int x = 0;

        // left-hand section: prev_x is clamped to 0
        for (; x < 2*rx; x++) {
            SkASSERT(prev_x <= 0);
            SkASSERT(next_x <= sw);

            int px = 0;
            int nx = next_x;

            uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
            *dst++ = SkToU8(tmp * scale >> 24);

            prev_x += 1;
            next_x += 1;
        }

        // cache the four box-corner indices; in the unclamped center section
        // each simply advances by one per output pixel
        int i0 = prev_x + py;
        int i1 = next_x + ny;
        int i2 = next_x + py;
        int i3 = prev_x + ny;

#if UNROLL_KERNEL_LOOP
        // center section, unrolled 4x (see UNROLL_KERNEL_LOOP note above)
        for (; x < dw - 2*rx - 4; x += 4) {
            SkASSERT(prev_x >= 0);
            SkASSERT(next_x <= sw);

            uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            *dst++ = SkToU8(tmp * scale >> 24);
            tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            *dst++ = SkToU8(tmp * scale >> 24);
            tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            *dst++ = SkToU8(tmp * scale >> 24);
            tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            *dst++ = SkToU8(tmp * scale >> 24);

            prev_x += 4;
            next_x += 4;
        }
#endif

        // center section: neither prev_x nor next_x is clamped
        for (; x < dw - 2*rx; x++) {
            SkASSERT(prev_x >= 0);
            SkASSERT(next_x <= sw);

            uint32_t tmp = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            *dst++ = SkToU8(tmp * scale >> 24);

            prev_x += 1;
            next_x += 1;
        }

        // right-hand section: next_x is clamped to sw
        for (; x < dw; x++) {
            SkASSERT(prev_x >= 0);
            SkASSERT(next_x > sw);

            int px = prev_x;
            int nx = sw;

            uint32_t tmp = sum[px+py] + sum[nx+ny] - sum[nx+py] - sum[px+ny];
            *dst++ = SkToU8(tmp * scale >> 24);

            prev_x += 1;
            next_x += 1;
        }

        prev_y += 1;
        next_y += 1;
    }
}
    263 
    264 /**
    265  * This is the path for apply_kernel_interp() to be taken when the kernel
    266  * is wider than the source image.
    267  */
static void kernel_interp_clamped(uint8_t dst[], int rx, int ry,
                const uint32_t sum[], int sw, int sh, U8CPU outer_weight) {
    SkASSERT(2*rx > sw);

    int inner_weight = 255 - outer_weight;

    // round these guys up if they're bigger than 127
    outer_weight += outer_weight >> 7;
    inner_weight += inner_weight >> 7;

    // weight / (box area) for the outer (radius rx) and inner (radius rx-1)
    // boxes; the weighted results are summed and shifted down by 24 below
    uint32_t outer_scale = (outer_weight << 16) / ((2*rx + 1)*(2*ry + 1));
    uint32_t inner_scale = (inner_weight << 16) / ((2*rx - 1)*(2*ry - 1));

    int sumStride = sw + 1;

    int dw = sw + 2*rx;
    int dh = sh + 2*ry;

    int prev_y = -2*ry;
    int next_y = 1;

    for (int y = 0; y < dh; y++) {
        // outer box rows, clamped into the sum buffer
        int py = SkClampPos(prev_y) * sumStride;
        int ny = SkFastMin32(next_y, sh) * sumStride;

        // inner box rows: inset one pixel from the outer box on each side
        int ipy = SkClampPos(prev_y + 1) * sumStride;
        int iny = SkClampMax(next_y - 1, sh) * sumStride;

        int prev_x = -2*rx;
        int next_x = 1;

        for (int x = 0; x < dw; x++) {
            // outer box columns
            int px = SkClampPos(prev_x);
            int nx = SkFastMin32(next_x, sw);

            // inner box columns (inset by one)
            int ipx = SkClampPos(prev_x + 1);
            int inx = SkClampMax(next_x - 1, sw);

            // box sums via the summed-area table corners, then blend the two
            // weighted averages to interpolate a fractional radius
            uint32_t outer_sum = sum[px+py] + sum[nx+ny]
                               - sum[nx+py] - sum[px+ny];
            uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny]
                               - sum[inx+ipy] - sum[ipx+iny];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);

            prev_x += 1;
            next_x += 1;
        }
        prev_y += 1;
        next_y += 1;
    }
}
    320 
    321 /**
    322  *  sw and sh are the width and height of the src. Since the sum buffer
    323  *  matches that, but has an extra row and col at the beginning (with zeros),
    324  *  we can just use sw and sh as our "max" values for pinning coordinates
    325  *  when sampling into sum[][]
    326  *
    327  *  The inner loop is conceptually simple; we break it into several variants
    328  *  to improve performance. Here's the original version:
    329         for (int x = 0; x < dw; x++) {
    330             int px = SkClampPos(prev_x);
    331             int nx = SkFastMin32(next_x, sw);
    332 
    333             int ipx = SkClampPos(prev_x + 1);
    334             int inx = SkClampMax(next_x - 1, sw);
    335 
    336             uint32_t outer_sum = sum[px+py] + sum[nx+ny]
    337                                - sum[nx+py] - sum[px+ny];
    338             uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny]
    339                                - sum[inx+ipy] - sum[ipx+iny];
    340             *dst++ = SkToU8((outer_sum * outer_scale
    341                            + inner_sum * inner_scale) >> 24);
    342 
    343             prev_x += 1;
    344             next_x += 1;
    345         }
    346  *  The sections are:
    347  *     left-hand section, where prev_x is clamped to 0
    348  *     center section, where neither prev_x nor next_x is clamped
    349  *     right-hand section, where next_x is clamped to sw
    350  *  On some operating systems, the center section is unrolled for additional
    351  *  speedup.
    352 */
static void apply_kernel_interp(uint8_t dst[], int rx, int ry,
                const uint32_t sum[], int sw, int sh, U8CPU outer_weight) {
    SkASSERT(rx > 0 && ry > 0);
    SkASSERT(outer_weight <= 255);

    // If the kernel is wider than the source, every sample clamps; punt to
    // the simple fully-clamped implementation.
    if (2*rx > sw) {
        kernel_interp_clamped(dst, rx, ry, sum, sw, sh, outer_weight);
        return;
    }

    int inner_weight = 255 - outer_weight;

    // round these guys up if they're bigger than 127
    outer_weight += outer_weight >> 7;
    inner_weight += inner_weight >> 7;

    // weight / (box area) for the outer (radius rx) and inner (radius rx-1)
    // boxes; the weighted results are summed and shifted down by 24 below
    uint32_t outer_scale = (outer_weight << 16) / ((2*rx + 1)*(2*ry + 1));
    uint32_t inner_scale = (inner_weight << 16) / ((2*rx - 1)*(2*ry - 1));

    int sumStride = sw + 1;

    // destination is the source inflated by the radius on every side
    int dw = sw + 2*rx;
    int dh = sh + 2*ry;

    int prev_y = -2*ry;
    int next_y = 1;

    // guarantees the left and center x-sections don't overlap
    SkASSERT(2*rx <= dw - 2*rx);

    for (int y = 0; y < dh; y++) {
        // outer box rows, clamped into the sum buffer
        int py = SkClampPos(prev_y) * sumStride;
        int ny = SkFastMin32(next_y, sh) * sumStride;

        // inner box rows: inset one pixel from the outer box on each side
        int ipy = SkClampPos(prev_y + 1) * sumStride;
        int iny = SkClampMax(next_y - 1, sh) * sumStride;

        int prev_x = -2*rx;
        int next_x = 1;
        int x = 0;

        // left-hand section: prev_x is clamped to 0
        for (; x < 2*rx; x++) {
            SkASSERT(prev_x < 0);
            SkASSERT(next_x <= sw);

            int px = 0;
            int nx = next_x;

            int ipx = 0;
            int inx = next_x - 1;

            uint32_t outer_sum = sum[px+py] + sum[nx+ny]
                               - sum[nx+py] - sum[px+ny];
            uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny]
                               - sum[inx+ipy] - sum[ipx+iny];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);

            prev_x += 1;
            next_x += 1;
        }

        // cache the eight box-corner indices (four outer, four inner); in the
        // unclamped center section each simply advances by one per pixel
        int i0 = prev_x + py;
        int i1 = next_x + ny;
        int i2 = next_x + py;
        int i3 = prev_x + ny;
        int i4 = prev_x + 1 + ipy;
        int i5 = next_x - 1 + iny;
        int i6 = next_x - 1 + ipy;
        int i7 = prev_x + 1 + iny;

#if UNROLL_KERNEL_LOOP
        // center section, unrolled 4x (see UNROLL_KERNEL_LOOP note above)
        for (; x < dw - 2*rx - 4; x += 4) {
            SkASSERT(prev_x >= 0);
            SkASSERT(next_x <= sw);

            uint32_t outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            uint32_t inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);
            outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);
            outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);
            outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);

            prev_x += 4;
            next_x += 4;
        }
#endif

        // center section: neither prev_x nor next_x is clamped
        for (; x < dw - 2*rx; x++) {
            SkASSERT(prev_x >= 0);
            SkASSERT(next_x <= sw);

            uint32_t outer_sum = sum[i0++] + sum[i1++] - sum[i2++] - sum[i3++];
            uint32_t inner_sum = sum[i4++] + sum[i5++] - sum[i6++] - sum[i7++];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);

            prev_x += 1;
            next_x += 1;
        }

        // right-hand section: next_x is clamped to sw
        for (; x < dw; x++) {
            SkASSERT(prev_x >= 0);
            SkASSERT(next_x > sw);

            int px = prev_x;
            int nx = sw;

            int ipx = prev_x + 1;
            int inx = sw;

            uint32_t outer_sum = sum[px+py] + sum[nx+ny]
                               - sum[nx+py] - sum[px+ny];
            uint32_t inner_sum = sum[ipx+ipy] + sum[inx+iny]
                               - sum[inx+ipy] - sum[ipx+iny];
            *dst++ = SkToU8((outer_sum * outer_scale
                           + inner_sum * inner_scale) >> 24);

            prev_x += 1;
            next_x += 1;
        }

        prev_y += 1;
        next_y += 1;
    }
}
    488 
    489 #include "SkColorPriv.h"
    490 
    491 static void merge_src_with_blur(uint8_t dst[], int dstRB,
    492                                 const uint8_t src[], int srcRB,
    493                                 const uint8_t blur[], int blurRB,
    494                                 int sw, int sh) {
    495     dstRB -= sw;
    496     srcRB -= sw;
    497     blurRB -= sw;
    498     while (--sh >= 0) {
    499         for (int x = sw - 1; x >= 0; --x) {
    500             *dst = SkToU8(SkAlphaMul(*blur, SkAlpha255To256(*src)));
    501             dst += 1;
    502             src += 1;
    503             blur += 1;
    504         }
    505         dst += dstRB;
    506         src += srcRB;
    507         blur += blurRB;
    508     }
    509 }
    510 
    511 static void clamp_with_orig(uint8_t dst[], int dstRowBytes,
    512                             const uint8_t src[], int srcRowBytes,
    513                             int sw, int sh,
    514                             SkBlurMask::Style style) {
    515     int x;
    516     while (--sh >= 0) {
    517         switch (style) {
    518         case SkBlurMask::kSolid_Style:
    519             for (x = sw - 1; x >= 0; --x) {
    520                 int s = *src;
    521                 int d = *dst;
    522                 *dst = SkToU8(s + d - SkMulDiv255Round(s, d));
    523                 dst += 1;
    524                 src += 1;
    525             }
    526             break;
    527         case SkBlurMask::kOuter_Style:
    528             for (x = sw - 1; x >= 0; --x) {
    529                 if (*src) {
    530                     *dst = SkToU8(SkAlphaMul(*dst, SkAlpha255To256(255 - *src)));
    531                 }
    532                 dst += 1;
    533                 src += 1;
    534             }
    535             break;
    536         default:
    537             SkDEBUGFAIL("Unexpected blur style here");
    538             break;
    539         }
    540         dst += dstRowBytes - sw;
    541         src += srcRowBytes - sw;
    542     }
    543 }
    544 
    545 ///////////////////////////////////////////////////////////////////////////////
    546 
// we use a local function to wrap the class static method to work around
// a bug in gcc98
void SkMask_FreeImage(uint8_t* image);
void SkMask_FreeImage(uint8_t* image) {
    SkMask::FreeImage(image);
}
    553 
bool SkBlurMask::Blur(SkMask* dst, const SkMask& src,
                      SkScalar radius, Style style, Quality quality,
                      SkIPoint* margin)
{
    if (src.fFormat != SkMask::kA8_Format) {
        return false;
    }

    // Force high quality off for small radii (performance)
    if (radius < SkIntToScalar(3)) quality = kLow_Quality;

    // highQuality: use three box blur passes as a cheap way to approximate a Gaussian blur
    int passCount = (quality == kHigh_Quality) ? 3 : 1;
    // shrink the per-pass radius so the combined passes approximate the
    // requested overall radius
    SkScalar passRadius = SkScalarDiv(radius, SkScalarSqrt(SkIntToScalar(passCount)));

    int rx = SkScalarCeil(passRadius);
    // outer_weight blends the boxes of radius rx and rx-1 to approximate the
    // fractional part of passRadius (see apply_kernel_interp)
    int outer_weight = 255 - SkScalarRound((SkIntToScalar(rx) - passRadius) * 255);

    SkASSERT(rx >= 0);
    SkASSERT((unsigned)outer_weight <= 255);
    if (rx <= 0) {
        return false;
    }

    int ry = rx;    // only do square blur for now

    // each pass inflates the image by the radius on every side
    int padx = passCount * rx;
    int pady = passCount * ry;
    if (margin) {
        margin->set(padx, pady);
    }
    dst->fBounds.set(src.fBounds.fLeft - padx, src.fBounds.fTop - pady,
        src.fBounds.fRight + padx, src.fBounds.fBottom + pady);
    dst->fRowBytes = dst->fBounds.width();
    dst->fFormat = SkMask::kA8_Format;
    dst->fImage = NULL;

    if (src.fImage) {
        size_t dstSize = dst->computeImageSize();
        if (0 == dstSize) {
            return false;   // too big to allocate, abort
        }

        int             sw = src.fBounds.width();
        int             sh = src.fBounds.height();
        const uint8_t*  sp = src.fImage;
        uint8_t*        dp = SkMask::AllocImage(dstSize);

        // frees dp on early return; detached below once dst owns the image
        SkAutoTCallVProc<uint8_t, SkMask_FreeImage> autoCall(dp);

        // build the blurry destination
        {
            // the sum buffer is reused by every pass, so size it for the
            // largest (final) pass
            const size_t storageW = sw + 2 * (passCount - 1) * rx + 1;
            const size_t storageH = sh + 2 * (passCount - 1) * ry + 1;
            SkAutoTMalloc<uint32_t> storage(storageW * storageH);
            uint32_t*               sumBuffer = storage.get();

            //pass1: sp is source, dp is destination
            build_sum_buffer(sumBuffer, sw, sh, sp, src.fRowBytes);
            if (outer_weight == 255) {
                apply_kernel(dp, rx, ry, sumBuffer, sw, sh);
            } else {
                apply_kernel_interp(dp, rx, ry, sumBuffer, sw, sh, outer_weight);
            }

            if (quality == kHigh_Quality) {
                //pass2: dp is source, tmpBuffer is destination
                int tmp_sw = sw + 2 * rx;   // pass1 output grew by the radius
                int tmp_sh = sh + 2 * ry;
                SkAutoTMalloc<uint8_t>  tmpBuffer(dstSize);
                build_sum_buffer(sumBuffer, tmp_sw, tmp_sh, dp, tmp_sw);
                if (outer_weight == 255)
                    apply_kernel(tmpBuffer.get(), rx, ry, sumBuffer, tmp_sw, tmp_sh);
                else
                    apply_kernel_interp(tmpBuffer.get(), rx, ry, sumBuffer,
                                        tmp_sw, tmp_sh, outer_weight);

                //pass3: tmpBuffer is source, dp is destination
                tmp_sw += 2 * rx;
                tmp_sh += 2 * ry;
                build_sum_buffer(sumBuffer, tmp_sw, tmp_sh, tmpBuffer.get(), tmp_sw);
                if (outer_weight == 255)
                    apply_kernel(dp, rx, ry, sumBuffer, tmp_sw, tmp_sh);
                else
                    apply_kernel_interp(dp, rx, ry, sumBuffer, tmp_sw, tmp_sh,
                                        outer_weight);
            }
        }

        dst->fImage = dp;
        // if need be, alloc the "real" dst (same size as src) and copy/merge
        // the blur into it (applying the src)
        if (style == kInner_Style) {
            // now we allocate the "real" dst, mirror the size of src
            size_t srcSize = src.computeImageSize();
            if (0 == srcSize) {
                return false;   // too big to allocate, abort
            }
            dst->fImage = SkMask::AllocImage(srcSize);
            // the blur image's interior (offset by the total padding) lines
            // up with src; merge that window against src's alpha
            merge_src_with_blur(dst->fImage, src.fRowBytes,
                                sp, src.fRowBytes,
                                dp + passCount * (rx + ry * dst->fRowBytes),
                                dst->fRowBytes, sw, sh);
            SkMask::FreeImage(dp);
        } else if (style != kNormal_Style) {
            // Solid/Outer styles modify the blur in place where it overlaps src
            clamp_with_orig(dp + passCount * (rx + ry * dst->fRowBytes),
                            dst->fRowBytes, sp, src.fRowBytes, sw, sh, style);
        }
        // ownership of dp has passed to dst->fImage (or dp was freed above)
        (void)autoCall.detach();
    }

    if (style == kInner_Style) {
        dst->fBounds = src.fBounds; // restore trimmed bounds
        dst->fRowBytes = src.fRowBytes;
    }

    return true;
}
    672 
    673