1 /* 2 * Copyright 2017 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include "SkArenaAlloc.h" 9 #include "SkGaussFilter.h" 10 #include "SkMalloc.h" 11 #include "SkMaskBlurFilter.h" 12 #include "SkNx.h" 13 #include "SkSafeMath.h" 14 15 #include <cmath> 16 #include <climits> 17 18 static const double kPi = 3.14159265358979323846264338327950288; 19 20 class BlurScanInterface { 21 public: 22 virtual ~BlurScanInterface() = default; 23 virtual void blur(const uint8_t* src, int srcStride, const uint8_t* srcEnd, 24 uint8_t* dst, int dstStride, uint8_t* dstEnd) const = 0; 25 virtual bool canBlur4() { return false; } 26 virtual void blur4Transpose( 27 const uint8_t* src, int srcStride, const uint8_t* srcEnd, 28 uint8_t* dst, int dstStride, uint8_t* dstEnd) const { 29 SK_ABORT("This should not be called."); 30 } 31 }; 32 33 class PlanningInterface { 34 public: 35 virtual ~PlanningInterface() = default; 36 virtual size_t bufferSize() const = 0; 37 virtual int border() const = 0; 38 virtual bool needsBlur() const = 0; 39 virtual BlurScanInterface* makeBlurScan( 40 SkArenaAlloc* alloc, int width, uint32_t* buffer) const = 0; 41 }; 42 43 class None final : public PlanningInterface { 44 public: 45 None() = default; 46 size_t bufferSize() const override { return 0; } 47 int border() const override { return 0; } 48 bool needsBlur() const override { return false; } 49 BlurScanInterface* makeBlurScan( 50 SkArenaAlloc* alloc, int width, uint32_t* buffer) const override { 51 SK_ABORT("Should never be called."); 52 return nullptr; 53 } 54 }; 55 56 class PlanGauss final : public PlanningInterface { 57 public: 58 explicit PlanGauss(double sigma) { 59 auto possibleWindow = static_cast<int>(floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5)); 60 auto window = std::max(1, possibleWindow); 61 62 fPass0Size = window - 1; 63 fPass1Size = window - 1; 64 fPass2Size = (window & 1) == 1 ? window - 1 : window; 65 66 // Calculating the border is tricky. I will go through the odd case which is simpler, and 67 // then through the even case. Given a stack of filters seven wide for the odd case of 68 // three passes. 69 // 70 // S 71 // aaaAaaa 72 // bbbBbbb 73 // cccCccc 74 // D 75 // 76 // The furthest changed pixel is when the filters are in the following configuration. 77 // 78 // S 79 // aaaAaaa 80 // bbbBbbb 81 // cccCccc 82 // D 83 // 84 // The A pixel is calculated using the value S, the B uses A, and the C uses B, and 85 // finally D is C. So, with a window size of seven the border is nine. In general, the 86 // border is 3*((window - 1)/2). 87 // 88 // For even cases the filter stack is more complicated. The spec specifies two passes 89 // of even filters and a final pass of odd filters. A stack for a width of six looks like 90 // this. 91 // 92 // S 93 // aaaAaa 94 // bbBbbb 95 // cccCccc 96 // D 97 // 98 // The furthest pixel looks like this. 99 // 100 // S 101 // aaaAaa 102 // bbBbbb 103 // cccCccc 104 // D 105 // 106 // For a window of size, the border value is seven. In general the border is 3 * 107 // (window/2) -1. 108 fBorder = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1; 109 fSlidingWindow = 2 * fBorder + 1; 110 111 // If the window is odd then the divisor is just window ^ 3 otherwise, 112 // it is window * window * (window + 1) = window ^ 2 + window ^ 3; 113 auto window2 = window * window; 114 auto window3 = window2 * window; 115 auto divisor = (window & 1) == 1 ? window3 : window3 + window2; 116 117 fWeight = static_cast<uint64_t>(round(1.0 / divisor * (1ull << 32))); 118 } 119 120 size_t bufferSize() const override { return fPass0Size + fPass1Size + fPass2Size; } 121 122 int border() const override { return fBorder; } 123 124 bool needsBlur() const override { return true; } 125 126 BlurScanInterface* makeBlurScan( 127 SkArenaAlloc* alloc, int width, uint32_t* buffer) const override 128 { 129 uint32_t* buffer0, *buffer0End, *buffer1, *buffer1End, *buffer2, *buffer2End; 130 buffer0 = buffer; 131 buffer0End = buffer1 = buffer0 + fPass0Size; 132 buffer1End = buffer2 = buffer1 + fPass1Size; 133 buffer2End = buffer2 + fPass2Size; 134 int noChangeCount = fSlidingWindow > width ? fSlidingWindow - width : 0; 135 136 return alloc->make<Gauss>( 137 fWeight, noChangeCount, 138 buffer0, buffer0End, 139 buffer1, buffer1End, 140 buffer2, buffer2End); 141 } 142 143 public: 144 class Gauss final : public BlurScanInterface { 145 public: 146 Gauss(uint64_t weight, int noChangeCount, 147 uint32_t* buffer0, uint32_t* buffer0End, 148 uint32_t* buffer1, uint32_t* buffer1End, 149 uint32_t* buffer2, uint32_t* buffer2End) 150 : fWeight{weight} 151 , fNoChangeCount{noChangeCount} 152 , fBuffer0{buffer0} 153 , fBuffer0End{buffer0End} 154 , fBuffer1{buffer1} 155 , fBuffer1End{buffer1End} 156 , fBuffer2{buffer2} 157 , fBuffer2End{buffer2End} 158 { } 159 160 void blur(const uint8_t* src, int srcStride, const uint8_t* srcEnd, 161 uint8_t* dst, int dstStride, uint8_t* dstEnd) const override { 162 auto buffer0Cursor = fBuffer0; 163 auto buffer1Cursor = fBuffer1; 164 auto buffer2Cursor = fBuffer2; 165 166 std::memset(fBuffer0, 0x00, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0)); 167 168 uint32_t sum0 = 0; 169 uint32_t sum1 = 0; 170 uint32_t sum2 = 0; 171 172 // Consume the source generating pixels. 173 for (auto srcCursor = src; 174 srcCursor < srcEnd; dst += dstStride, srcCursor += srcStride) { 175 uint32_t leadingEdge = *srcCursor; 176 sum0 += leadingEdge; 177 sum1 += sum0; 178 sum2 += sum1; 179 180 *dst = this->finalScale(sum2); 181 182 sum2 -= *buffer2Cursor; 183 *buffer2Cursor = sum1; 184 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2; 185 186 sum1 -= *buffer1Cursor; 187 *buffer1Cursor = sum0; 188 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1; 189 190 sum0 -= *buffer0Cursor; 191 *buffer0Cursor = leadingEdge; 192 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0; 193 } 194 195 // The leading edge is off the right side of the mask. 196 for (int i = 0; i < fNoChangeCount; i++) { 197 uint32_t leadingEdge = 0; 198 sum0 += leadingEdge; 199 sum1 += sum0; 200 sum2 += sum1; 201 202 *dst = this->finalScale(sum2); 203 204 sum2 -= *buffer2Cursor; 205 *buffer2Cursor = sum1; 206 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2; 207 208 sum1 -= *buffer1Cursor; 209 *buffer1Cursor = sum0; 210 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1; 211 212 sum0 -= *buffer0Cursor; 213 *buffer0Cursor = leadingEdge; 214 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0; 215 216 dst += dstStride; 217 } 218 219 // Starting from the right, fill in the rest of the buffer. 220 std::memset(fBuffer0, 0, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0)); 221 222 sum0 = sum1 = sum2 = 0; 223 224 uint8_t* dstCursor = dstEnd; 225 const uint8_t* srcCursor = srcEnd; 226 while (dstCursor > dst) { 227 dstCursor -= dstStride; 228 srcCursor -= srcStride; 229 uint32_t leadingEdge = *srcCursor; 230 sum0 += leadingEdge; 231 sum1 += sum0; 232 sum2 += sum1; 233 234 *dstCursor = this->finalScale(sum2); 235 236 sum2 -= *buffer2Cursor; 237 *buffer2Cursor = sum1; 238 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2; 239 240 sum1 -= *buffer1Cursor; 241 *buffer1Cursor = sum0; 242 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1; 243 244 sum0 -= *buffer0Cursor; 245 *buffer0Cursor = leadingEdge; 246 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0; 247 } 248 } 249 250 private: 251 static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31; 252 253 uint8_t finalScale(uint32_t sum) const { 254 return SkTo<uint8_t>((fWeight * sum + kHalf) >> 32); 255 } 256 257 uint64_t fWeight; 258 int fNoChangeCount; 259 uint32_t* fBuffer0; 260 uint32_t* fBuffer0End; 261 uint32_t* fBuffer1; 262 uint32_t* fBuffer1End; 263 uint32_t* fBuffer2; 264 uint32_t* fBuffer2End; 265 }; 266 267 uint64_t fWeight; 268 int fBorder; 269 int fSlidingWindow; 270 int fPass0Size; 271 int fPass1Size; 272 int fPass2Size; 273 }; 274 275 // NB 136 is the largest sigma that will not cause a buffer full of 255 mask values to overflow 276 // using the Gauss filter. It also limits the size of buffers used hold intermediate values. 277 // Explanation of maximums: 278 // sum0 = window * 255 279 // sum1 = window * sum0 -> window * window * 255 280 // sum2 = window * sum1 -> window * window * window * 255 -> window^3 * 255 281 // 282 // The value window^3 * 255 must fit in a uint32_t. So, 283 // window^3 < 2^32. window = 255. 284 // 285 // window = floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5) 286 // For window <= 255, the largest value for sigma is 136. 287 SkMaskBlurFilter::SkMaskBlurFilter(double sigmaW, double sigmaH) 288 : fSigmaW{SkTPin(sigmaW, 0.0, 136.0)} 289 , fSigmaH{SkTPin(sigmaH, 0.0, 136.0)} 290 { 291 SkASSERT(sigmaW >= 0); 292 SkASSERT(sigmaH >= 0); 293 } 294 295 bool SkMaskBlurFilter::hasNoBlur() const { 296 return (3 * fSigmaW <= 1) && (3 * fSigmaH <= 1); 297 } 298 299 static SkMask prepare_destination(int radiusX, int radiusY, const SkMask& src) { 300 SkSafeMath safe; 301 302 SkMask dst; 303 // dstW = srcW + 2 * radiusX; 304 size_t dstW = safe.add(src.fBounds.width(), safe.add(radiusX, radiusX)); 305 // dstH = srcH + 2 * radiusY; 306 size_t dstH = safe.add(src.fBounds.height(), safe.add(radiusY, radiusY)); 307 308 dst.fBounds.set(0, 0, SkTo<int>(dstW), SkTo<int>(dstH)); 309 dst.fBounds.offset(src.fBounds.x(), src.fBounds.y()); 310 dst.fBounds.offset(-radiusX, -radiusY); 311 312 dst.fImage = nullptr; 313 dst.fRowBytes = SkTo<uint32_t>(dstW); 314 dst.fFormat = SkMask::kA8_Format; 315 316 size_t toAlloc = safe.mul(dstW, dstH); 317 318 if (safe && src.fImage != nullptr) { 319 dst.fImage = SkMask::AllocImage(toAlloc); 320 } 321 322 return dst; 323 } 324 325 static constexpr uint16_t _____ = 0u; 326 static constexpr uint16_t kHalf = 0x80u; 327 328 static SK_ALWAYS_INLINE Sk8h load(const uint8_t* from, int width) { 329 uint8_t buffer[8]; 330 if (width < 8) { 331 sk_bzero(buffer, sizeof(buffer)); 332 for (int i = 0; i < width; i++) { 333 buffer[i] = from[i]; 334 } 335 from = buffer; 336 } 337 auto v = SkNx_cast<uint16_t>(Sk8b::Load(from)); 338 // Convert from 0-255 to 8.8 encoding. 339 return v << 8; 340 }; 341 342 static SK_ALWAYS_INLINE void store(uint8_t* to, const Sk8h& v, int width) { 343 Sk8b b = SkNx_cast<uint8_t>(v >> 8); 344 if (width == 8) { 345 b.store(to); 346 } else { 347 uint8_t buffer[8]; 348 b.store(buffer); 349 for (int i = 0; i < width; i++) { 350 to[i] = buffer[i]; 351 } 352 } 353 }; 354 355 // In all the blur_x_radius_N and blur_y_radius_N functions the gaussian values are encoded 356 // in 0.16 format, none of the values is greater than one. The incoming mask values are in 8.8 357 // format. The resulting multiply has a 8.24 format, by the mulhi truncates the lower 16 bits 358 // resulting in a 8.8 format. 359 // 360 // The blur_x_radius_N function below blur along a row of pixels using a kernel with radius N. This 361 // system is setup to minimize the number of multiplies needed. 362 // 363 // Explanation: 364 // Blurring a specific mask value is given by the following equation where D_n is the resulting 365 // mask value and S_n is the source value. The example below is for a filter with a radius of 1 366 // and a width of 3 (radius == (width-1)/2). The indexes for the source and destination are 367 // aligned. The filter is given by G_n where n is the symmetric filter value. 368 // 369 // D[n] = S[n-1]*G[1] + S[n]*G[0] + S[n+1]*G[1]. 370 // 371 // We can start the source index at an offset relative to the destination separated by the 372 // radius. This results in a non-traditional restating of the above filter. 373 // 374 // D[n] = S[n]*G[1] + S[n+1]*G[0] + S[n+2]*G[1] 375 // 376 // If we look at three specific consecutive destinations the following equations result: 377 // 378 // D[5] = S[5]*G[1] + S[6]*G[0] + S[7]*G[1] 379 // D[7] = S[6]*G[1] + S[7]*G[0] + S[8]*G[1] 380 // D[8] = S[7]*G[1] + S[8]*G[0] + S[9]*G[1]. 381 // 382 // In the above equations, notice that S[7] is used in all three. In particular, two values are 383 // used: S[7]*G[0] and S[7]*G[1]. So, S[7] is only multiplied twice, but used in D[5], D[6] and 384 // D[7]. 385 // 386 // From the point of view of a source value we end up with the following three equations. 387 // 388 // Given S[7]: 389 // D[5] += S[7]*G[1] 390 // D[6] += S[7]*G[0] 391 // D[7] += S[7]*G[1] 392 // 393 // In General: 394 // D[n] += S[n]*G[1] 395 // D[n+1] += S[n]*G[0] 396 // D[n+2] += S[n]*G[1] 397 // 398 // Now these equations can be ganged using SIMD to form: 399 // D[n..n+7] += S[n..n+7]*G[1] 400 // D[n+1..n+8] += S[n..n+7]*G[0] 401 // D[n+2..n+9] += S[n..n+7]*G[1] 402 // The next set of values becomes. 403 // D[n+8..n+15] += S[n+8..n+15]*G[1] 404 // D[n+9..n+16] += S[n+8..n+15]*G[0] 405 // D[n+10..n+17] += S[n+8..n+15]*G[1] 406 // You can see that the D[n+8] and D[n+9] values overlap the two sets, using parts of both 407 // S[n..7] and S[n+8..n+15]. 408 // 409 // Just one more transformation allows the code to maintain all working values in 410 // registers. I introduce the notation {0, S[n..n+7] * G[k]} to mean that the value where 0 is 411 // prepended to the array of values to form {0, S[n] * G[k], ..., S[n+7]*G[k]}. 412 // 413 // D[n..n+7] += S[n..n+7] * G[1] 414 // D[n..n+8] += {0, S[n..n+7] * G[0]} 415 // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} 416 // 417 // Now we can encode D[n..n+7] in a single Sk8h register called d0, and D[n+8..n+15] in a 418 // register d8. In addition, S[0..n+7] becomes s0. 419 // 420 // The translation of the {0, S[n..n+7] * G[k]} is translated in the following way below. 421 // 422 // Sk8h v0 = s0*G[0] 423 // Sk8h v1 = s0*G[1] 424 // /* D[n..n+7] += S[n..n+7] * G[1] */ 425 // d0 += v1; 426 // /* D[n..n+8] += {0, S[n..n+7] * G[0]} */ 427 // d0 += {_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]} 428 // d1 += {v0[7], _____, _____, _____, _____, _____, _____, _____} 429 // /* D[n..n+9] += {0, 0, S[n..n+7] * G[1]} */ 430 // d0 += {_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]} 431 // d1 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____} 432 // Where we rely on the compiler to generate efficient code for the {____, n, ....} notation. 433 434 static SK_ALWAYS_INLINE void blur_x_radius_1( 435 const Sk8h& s0, 436 const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&, 437 Sk8h* d0, Sk8h* d8) { 438 439 auto v1 = s0.mulHi(g1); 440 auto v0 = s0.mulHi(g0); 441 442 // D[n..n+7] += S[n..n+7] * G[1] 443 *d0 += v1; 444 445 //D[n..n+8] += {0, S[n..n+7] * G[0]} 446 *d0 += Sk8h{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}; 447 *d8 += Sk8h{v0[7], _____, _____, _____, _____, _____, _____, _____}; 448 449 // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} 450 *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}; 451 *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____}; 452 453 } 454 455 static SK_ALWAYS_INLINE void blur_x_radius_2( 456 const Sk8h& s0, 457 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&, 458 Sk8h* d0, Sk8h* d8) { 459 auto v0 = s0.mulHi(g0); 460 auto v1 = s0.mulHi(g1); 461 auto v2 = s0.mulHi(g2); 462 463 // D[n..n+7] += S[n..n+7] * G[2] 464 *d0 += v2; 465 466 // D[n..n+8] += {0, S[n..n+7] * G[1]} 467 *d0 += Sk8h{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]}; 468 *d8 += Sk8h{v1[7], _____, _____, _____, _____, _____, _____, _____}; 469 470 // D[n..n+9] += {0, 0, S[n..n+7] * G[0]} 471 *d0 += Sk8h{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]}; 472 *d8 += Sk8h{v0[6], v0[7], _____, _____, _____, _____, _____, _____}; 473 474 // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]} 475 *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]}; 476 *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____}; 477 478 // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[2]} 479 *d0 += Sk8h{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]}; 480 *d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____}; 481 } 482 483 static SK_ALWAYS_INLINE void blur_x_radius_3( 484 const Sk8h& s0, 485 const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&, 486 Sk8h* d0, Sk8h* d8) { 487 auto v0 = s0.mulHi(gauss0); 488 auto v1 = s0.mulHi(gauss1); 489 auto v2 = s0.mulHi(gauss2); 490 auto v3 = s0.mulHi(gauss3); 491 492 // D[n..n+7] += S[n..n+7] * G[3] 493 *d0 += v3; 494 495 // D[n..n+8] += {0, S[n..n+7] * G[2]} 496 *d0 += Sk8h{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]}; 497 *d8 += Sk8h{v2[7], _____, _____, _____, _____, _____, _____, _____}; 498 499 // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} 500 *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}; 501 *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____}; 502 503 // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[0]} 504 *d0 += Sk8h{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]}; 505 *d8 += Sk8h{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____}; 506 507 // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[1]} 508 *d0 += Sk8h{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]}; 509 *d8 += Sk8h{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____}; 510 511 // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[2]} 512 *d0 += Sk8h{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]}; 513 *d8 += Sk8h{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____}; 514 515 // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]} 516 *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v3[0], v3[1]}; 517 *d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____}; 518 } 519 520 static SK_ALWAYS_INLINE void blur_x_radius_4( 521 const Sk8h& s0, 522 const Sk8h& gauss0, 523 const Sk8h& gauss1, 524 const Sk8h& gauss2, 525 const Sk8h& gauss3, 526 const Sk8h& gauss4, 527 Sk8h* d0, Sk8h* d8) { 528 auto v0 = s0.mulHi(gauss0); 529 auto v1 = s0.mulHi(gauss1); 530 auto v2 = s0.mulHi(gauss2); 531 auto v3 = s0.mulHi(gauss3); 532 auto v4 = s0.mulHi(gauss4); 533 534 // D[n..n+7] += S[n..n+7] * G[4] 535 *d0 += v4; 536 537 // D[n..n+8] += {0, S[n..n+7] * G[3]} 538 *d0 += Sk8h{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]}; 539 *d8 += Sk8h{v3[7], _____, _____, _____, _____, _____, _____, _____}; 540 541 // D[n..n+9] += {0, 0, S[n..n+7] * G[2]} 542 *d0 += Sk8h{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]}; 543 *d8 += Sk8h{v2[6], v2[7], _____, _____, _____, _____, _____, _____}; 544 545 // D[n..n+10] += {0, 0, 0, S[n..n+7] * G[1]} 546 *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]}; 547 *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____}; 548 549 // D[n..n+11] += {0, 0, 0, 0, S[n..n+7] * G[0]} 550 *d0 += Sk8h{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]}; 551 *d8 += Sk8h{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____}; 552 553 // D[n..n+12] += {0, 0, 0, 0, 0, S[n..n+7] * G[1]} 554 *d0 += Sk8h{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]}; 555 *d8 += Sk8h{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____}; 556 557 // D[n..n+13] += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]} 558 *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v2[0], v2[1]}; 559 *d8 += Sk8h{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____}; 560 561 // D[n..n+14] += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]} 562 *d0 += Sk8h{_____, _____, _____, _____, _____, _____, _____, v3[0]}; 563 *d8 += Sk8h{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____}; 564 565 // D[n..n+15] += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]} 566 *d8 += v4; 567 } 568 569 using BlurX = decltype(blur_x_radius_1); 570 571 // BlurX will only be one of the functions blur_x_radius_(1|2|3|4). 572 static SK_ALWAYS_INLINE void blur_row( 573 BlurX blur, 574 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, 575 const uint8_t* src, int srcW, 576 uint8_t* dst, int dstW) { 577 // Clear the buffer to handle summing wider than source. 578 Sk8h d0{kHalf}, d8{kHalf}; 579 580 // Go by multiples of 8 in src. 581 int x = 0; 582 for (; x <= srcW - 8; x += 8) { 583 blur(load(src, 8), g0, g1, g2, g3, g4, &d0, &d8); 584 585 store(dst, d0, 8); 586 587 d0 = d8; 588 d8 = Sk8h{kHalf}; 589 590 src += 8; 591 dst += 8; 592 } 593 594 // There are src values left, but the remainder of src values is not a multiple of 8. 595 int srcTail = srcW - x; 596 if (srcTail > 0) { 597 598 blur(load(src, srcTail), g0, g1, g2, g3, g4, &d0, &d8); 599 600 int dstTail = std::min(8, dstW - x); 601 store(dst, d0, dstTail); 602 603 d0 = d8; 604 dst += dstTail; 605 x += dstTail; 606 } 607 608 // There are dst mask values to complete. 609 int dstTail = dstW - x; 610 if (dstTail > 0) { 611 store(dst, d0, dstTail); 612 } 613 } 614 615 // BlurX will only be one of the functions blur_x_radius_(1|2|3|4). 616 static SK_ALWAYS_INLINE void blur_x_rect( 617 BlurX blur, 618 uint16_t* gauss, 619 const uint8_t* src, size_t srcStride, int srcW, 620 uint8_t* dst, size_t dstStride, int dstW, int dstH) { 621 622 Sk8h g0{gauss[0]}, 623 g1{gauss[1]}, 624 g2{gauss[2]}, 625 g3{gauss[3]}, 626 g4{gauss[4]}; 627 628 // Blur *ALL* the rows. 629 for (int y = 0; y < dstH; y++) { 630 blur_row(blur, g0, g1, g2, g3, g4, src, srcW, dst, dstW); 631 src += srcStride; 632 dst += dstStride; 633 } 634 } 635 636 SK_ATTRIBUTE(noinline) static void direct_blur_x( 637 int radius, uint16_t* gauss, 638 const uint8_t* src, size_t srcStride, int srcW, 639 uint8_t* dst, size_t dstStride, int dstW, int dstH) { 640 641 switch (radius) { 642 case 1: 643 blur_x_rect(blur_x_radius_1, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); 644 break; 645 646 case 2: 647 blur_x_rect(blur_x_radius_2, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); 648 break; 649 650 case 3: 651 blur_x_rect(blur_x_radius_3, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); 652 break; 653 654 case 4: 655 blur_x_rect(blur_x_radius_4, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH); 656 break; 657 658 default: 659 SkASSERTF(false, "The radius %d is not handled\n", radius); 660 } 661 } 662 663 // The operations of the blur_y_radius_N functions work on a theme similar to the blur_x_radius_N 664 // functions, but end up being simpler because there is no complicated shift of registers. We 665 // start with the non-traditional form of the gaussian filter. In the following r is the value 666 // when added generates the next value in the column. 667 // 668 // D[n+0r] = S[n+0r]*G[1] 669 // + S[n+1r]*G[0] 670 // + S[n+2r]*G[1] 671 // 672 // Expanding out in a way similar to blur_x_radius_N for specific values of n. 673 // 674 // D[n+0r] = S[n-2r]*G[1] + S[n-1r]*G[0] + S[n+0r]*G[1] 675 // D[n+1r] = S[n-1r]*G[1] + S[n+0r]*G[0] + S[n+1r]*G[1] 676 // D[n+2r] = S[n+0r]*G[1] + S[n+1r]*G[0] + S[n+2r]*G[1] 677 // 678 // We can see that S[n+0r] is in all three D[] equations, but is only multiplied twice. Now we 679 // can look at the calculation form the point of view of a source value. 680 // 681 // Given S[n+0r]: 682 // D[n+0r] += S[n+0r]*G[1]; 683 // /* D[n+0r] is done and can be stored now. */ 684 // D[n+1r] += S[n+0r]*G[0]; 685 // D[n+2r] = S[n+0r]*G[1]; 686 // 687 // Remember, by induction, that D[n+0r] == S[n-2r]*G[1] + S[n-1r]*G[0] before adding in 688 // S[n+0r]*G[1]. So, after the addition D[n+0r] has finished calculation and can be stored. Also, 689 // notice that D[n+2r] is receiving its first value from S[n+0r]*G[1] and is not added in. Notice 690 // how values flow in the following two iterations in source. 691 // 692 // D[n+0r] += S[n+0r]*G[1] 693 // D[n+1r] += S[n+0r]*G[0] 694 // D[n+2r] = S[n+0r]*G[1] 695 // /* ------- */ 696 // D[n+1r] += S[n+1r]*G[1] 697 // D[n+2r] += S[n+1r]*G[0] 698 // D[n+3r] = S[n+1r]*G[1] 699 // 700 // Instead of using memory we can introduce temporaries d01 and d12. The update step changes 701 // to the following. 702 // 703 // answer = d01 + S[n+0r]*G[1] 704 // d01 = d12 + S[n+0r]*G[0] 705 // d12 = S[n+0r]*G[1] 706 // return answer 707 // 708 // Finally, this can be ganged into SIMD style. 709 // answer[0..7] = d01[0..7] + S[n+0r..n+0r+7]*G[1] 710 // d01[0..7] = d12[0..7] + S[n+0r..n+0r+7]*G[0] 711 // d12[0..7] = S[n+0r..n+0r+7]*G[1] 712 // return answer[0..7] 713 static SK_ALWAYS_INLINE Sk8h blur_y_radius_1( 714 const Sk8h& s0, 715 const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&, 716 Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) { 717 auto v0 = s0.mulHi(g0); 718 auto v1 = s0.mulHi(g1); 719 720 Sk8h answer = *d01 + v1; 721 *d01 = *d12 + v0; 722 *d12 = v1 + kHalf; 723 724 return answer; 725 } 726 727 static SK_ALWAYS_INLINE Sk8h blur_y_radius_2( 728 const Sk8h& s0, 729 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&, 730 Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) { 731 auto v0 = s0.mulHi(g0); 732 auto v1 = s0.mulHi(g1); 733 auto v2 = s0.mulHi(g2); 734 735 Sk8h answer = *d01 + v2; 736 *d01 = *d12 + v1; 737 *d12 = *d23 + v0; 738 *d23 = *d34 + v1; 739 *d34 = v2 + kHalf; 740 741 return answer; 742 } 743 744 static SK_ALWAYS_INLINE Sk8h blur_y_radius_3( 745 const Sk8h& s0, 746 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&, 747 Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) { 748 auto v0 = s0.mulHi(g0); 749 auto v1 = s0.mulHi(g1); 750 auto v2 = s0.mulHi(g2); 751 auto v3 = s0.mulHi(g3); 752 753 Sk8h answer = *d01 + v3; 754 *d01 = *d12 + v2; 755 *d12 = *d23 + v1; 756 *d23 = *d34 + v0; 757 *d34 = *d45 + v1; 758 *d45 = *d56 + v2; 759 *d56 = v3 + kHalf; 760 761 return answer; 762 } 763 764 static SK_ALWAYS_INLINE Sk8h blur_y_radius_4( 765 const Sk8h& s0, 766 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, 767 Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) { 768 auto v0 = s0.mulHi(g0); 769 auto v1 = s0.mulHi(g1); 770 auto v2 = s0.mulHi(g2); 771 auto v3 = s0.mulHi(g3); 772 auto v4 = s0.mulHi(g4); 773 774 Sk8h answer = *d01 + v4; 775 *d01 = *d12 + v3; 776 *d12 = *d23 + v2; 777 *d23 = *d34 + v1; 778 *d34 = *d45 + v0; 779 *d45 = *d56 + v1; 780 *d56 = *d67 + v2; 781 *d67 = *d78 + v3; 782 *d78 = v4 + kHalf; 783 784 return answer; 785 } 786 787 using BlurY = decltype(blur_y_radius_1); 788 789 // BlurY will be one of blur_y_radius_(1|2|3|4). 790 static SK_ALWAYS_INLINE void blur_column( 791 BlurY blur, int radius, int width, 792 const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4, 793 const uint8_t* src, size_t srcStride, int srcH, 794 uint8_t* dst, size_t dstStride) { 795 Sk8h d01{kHalf}, d12{kHalf}, d23{kHalf}, d34{kHalf}, 796 d45{kHalf}, d56{kHalf}, d67{kHalf}, d78{kHalf}; 797 798 auto flush = [&](uint8_t* to, const Sk8h& v0, const Sk8h& v1) { 799 store(to, v0, width); 800 to += dstStride; 801 store(to, v1, width); 802 return to + dstStride; 803 }; 804 805 for (int y = 0; y < srcH; y += 1) { 806 auto s = load(src, width); 807 auto b = blur(s, 808 g0, g1, g2, g3, g4, 809 &d01, &d12, &d23, &d34, &d45, &d56, &d67, &d78); 810 store(dst, b, width); 811 src += srcStride; 812 dst += dstStride; 813 } 814 815 if (radius >= 1) { 816 dst = flush(dst, d01, d12); 817 } 818 if (radius >= 2) { 819 dst = flush(dst, d23, d34); 820 } 821 if (radius >= 3) { 822 dst = flush(dst, d45, d56); 823 } 824 if (radius >= 4) { 825 flush(dst, d67, d78); 826 } 827 } 828 829 // BlurY will be one of blur_y_radius_(1|2|3|4). 830 static SK_ALWAYS_INLINE void blur_y_rect( 831 BlurY blur, int radius, uint16_t *gauss, 832 const uint8_t *src, size_t srcStride, int srcW, int srcH, 833 uint8_t *dst, size_t dstStride) { 834 835 Sk8h g0{gauss[0]}, 836 g1{gauss[1]}, 837 g2{gauss[2]}, 838 g3{gauss[3]}, 839 g4{gauss[4]}; 840 841 int x = 0; 842 for (; x <= srcW - 8; x += 8) { 843 blur_column(blur, radius, 8, 844 g0, g1, g2, g3, g4, 845 src, srcStride, srcH, 846 dst, dstStride); 847 src += 8; 848 dst += 8; 849 } 850 851 int xTail = srcW - x; 852 if (xTail > 0) { 853 blur_column(blur, radius, xTail, 854 g0, g1, g2, g3, g4, 855 src, srcStride, srcH, 856 dst, dstStride); 857 } 858 } 859 860 SK_ATTRIBUTE(noinline) static void direct_blur_y( 861 int radius, uint16_t* gauss, 862 const uint8_t* src, size_t srcStride, int srcW, int srcH, 863 uint8_t* dst, size_t dstStride) { 864 865 switch (radius) { 866 case 1: 867 blur_y_rect(blur_y_radius_1, 1, gauss, 868 src, srcStride, srcW, srcH, 869 dst, dstStride); 870 break; 871 872 case 2: 873 blur_y_rect(blur_y_radius_2, 2, gauss, 874 src, srcStride, srcW, srcH, 875 dst, dstStride); 876 break; 877 878 case 3: 879 blur_y_rect(blur_y_radius_3, 3, gauss, 880 src, srcStride, srcW, srcH, 881 dst, dstStride); 882 break; 883 884 case 4: 885 blur_y_rect(blur_y_radius_4, 4, gauss, 886 src, srcStride, srcW, srcH, 887 dst, dstStride); 888 break; 889 890 default: 891 SkASSERTF(false, "The radius %d is not handled\n", radius); 892 } 893 } 894 895 static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMask* dst) { 896 SkASSERT(0 <= sigmaX && sigmaX < 2); 897 SkASSERT(0 <= sigmaY && sigmaY < 2); 898 899 SkGaussFilter filterX{sigmaX, SkGaussFilter::Type::Bessel}, 900 filterY{sigmaY, SkGaussFilter::Type::Bessel}; 901 902 int radiusX = filterX.radius(), 903 radiusY = filterY.radius(); 904 905 SkASSERT(radiusX <= 4 && radiusY <= 4); 906 907 auto prepareGauss = [](const SkGaussFilter& filter, uint16_t* factors) { 908 int i = 0; 909 for (double d : filter) { 910 factors[i++] = static_cast<uint16_t>(round(d * (1 << 16))); 911 } 912 }; 913 914 uint16_t gaussFactorsX[SkGaussFilter::kGaussArrayMax], 915 gaussFactorsY[SkGaussFilter::kGaussArrayMax]; 916 917 prepareGauss(filterX, gaussFactorsX); 918 prepareGauss(filterY, gaussFactorsY); 919 920 *dst = prepare_destination(radiusX, radiusY, src); 921 if (src.fImage == nullptr) { 922 return {SkTo<int32_t>(radiusX), SkTo<int32_t>(radiusY)}; 923 } 924 if (dst->fImage == nullptr) { 925 dst->fBounds.setEmpty(); 926 return {0, 0}; 927 } 928 929 int srcW = src.fBounds.width(), 930 srcH = src.fBounds.height(); 931 932 int dstW = dst->fBounds.width(), 933 dstH = dst->fBounds.height(); 934 935 size_t srcStride = src.fRowBytes, 936 dstStride = dst->fRowBytes; 937 938 //TODO: handle bluring in only one direction. 939 940 // Blur vertically and copy to destination. 941 direct_blur_y(radiusY, gaussFactorsY, 942 src.fImage, srcStride, srcW, srcH, 943 dst->fImage + radiusX, dstStride); 944 945 // Blur horizontally in place. 946 direct_blur_x(radiusX, gaussFactorsX, 947 dst->fImage + radiusX, dstStride, srcW, 948 dst->fImage, dstStride, dstW, dstH); 949 950 return {radiusX, radiusY}; 951 } 952 953 // TODO: assuming sigmaW = sigmaH. Allow different sigmas. Right now the 954 // API forces the sigmas to be the same. 955 SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const { 956 957 if (fSigmaW < 2.0 && fSigmaH < 2.0) { 958 return small_blur(fSigmaW, fSigmaH, src, dst); 959 } 960 961 // 1024 is a place holder guess until more analysis can be done. 962 SkSTArenaAlloc<1024> alloc; 963 964 PlanningInterface* planW = alloc.make<PlanGauss>(fSigmaW); 965 PlanningInterface* planH = alloc.make<PlanGauss>(fSigmaH); 966 967 int borderW = planW->border(), 968 borderH = planH->border(); 969 SkASSERT(borderH >= 0 && borderW >= 0); 970 971 *dst = prepare_destination(borderW, borderH, src); 972 if (src.fImage == nullptr) { 973 return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)}; 974 } 975 if (dst->fImage == nullptr) { 976 dst->fBounds.setEmpty(); 977 return {0, 0}; 978 } 979 980 int srcW = src.fBounds.width(), 981 srcH = src.fBounds.height(), 982 dstW = dst->fBounds.width(), 983 dstH = dst->fBounds.height(); 984 SkASSERT(srcW >= 0 && srcH >= 0 && dstW >= 0 && dstH >= 0); 985 986 auto bufferSize = std::max(planW->bufferSize(), planH->bufferSize()); 987 auto buffer = alloc.makeArrayDefault<uint32_t>(bufferSize); 988 989 if (planW->needsBlur() && planH->needsBlur()) { 990 // Blur both directions. 991 int tmpW = srcH, 992 tmpH = dstW; 993 994 auto tmp = alloc.makeArrayDefault<uint8_t>(tmpW * tmpH); 995 996 // Blur horizontally, and transpose. 997 auto scanW = planW->makeBlurScan(&alloc, srcW, buffer); 998 for (int y = 0; y < srcH; y++) { 999 auto srcStart = &src.fImage[y * src.fRowBytes]; 1000 auto tmpStart = &tmp[y]; 1001 scanW->blur(srcStart, 1, srcStart + srcW, 1002 tmpStart, tmpW, tmpStart + tmpW * tmpH); 1003 } 1004 1005 // Blur vertically (scan in memory order because of the transposition), 1006 // and transpose back to the original orientation. 1007 auto scanH = planH->makeBlurScan(&alloc, tmpW, buffer); 1008 for (int y = 0; y < tmpH; y++) { 1009 auto tmpStart = &tmp[y * tmpW]; 1010 auto dstStart = &dst->fImage[y]; 1011 1012 scanH->blur(tmpStart, 1, tmpStart + tmpW, 1013 dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH); 1014 } 1015 } else { 1016 // Copy to dst. No Blur. 1017 SkASSERT(false); // should not get here 1018 for (int y = 0; y < srcH; y++) { 1019 std::memcpy(&dst->fImage[y * dst->fRowBytes], &src.fImage[y * src.fRowBytes], dstW); 1020 } 1021 } 1022 1023 return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)}; 1024 } 1025