1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "SkConvolver.h" 6 #include "SkTArray.h" 7 8 namespace { 9 10 // Converts the argument to an 8-bit unsigned value by clamping to the range 11 // 0-255. 12 inline unsigned char ClampTo8(int a) { 13 if (static_cast<unsigned>(a) < 256) { 14 return a; // Avoid the extra check in the common case. 15 } 16 if (a < 0) { 17 return 0; 18 } 19 return 255; 20 } 21 22 // Stores a list of rows in a circular buffer. The usage is you write into it 23 // by calling AdvanceRow. It will keep track of which row in the buffer it 24 // should use next, and the total number of rows added. 25 class CircularRowBuffer { 26 public: 27 // The number of pixels in each row is given in |sourceRowPixelWidth|. 28 // The maximum number of rows needed in the buffer is |maxYFilterSize| 29 // (we only need to store enough rows for the biggest filter). 30 // 31 // We use the |firstInputRow| to compute the coordinates of all of the 32 // following rows returned by Advance(). 33 CircularRowBuffer(int destRowPixelWidth, int maxYFilterSize, 34 int firstInputRow) 35 : fRowByteWidth(destRowPixelWidth * 4), 36 fNumRows(maxYFilterSize), 37 fNextRow(0), 38 fNextRowCoordinate(firstInputRow) { 39 fBuffer.reset(fRowByteWidth * maxYFilterSize); 40 fRowAddresses.reset(fNumRows); 41 } 42 43 // Moves to the next row in the buffer, returning a pointer to the beginning 44 // of it. 45 unsigned char* advanceRow() { 46 unsigned char* row = &fBuffer[fNextRow * fRowByteWidth]; 47 fNextRowCoordinate++; 48 49 // Set the pointer to the next row to use, wrapping around if necessary. 50 fNextRow++; 51 if (fNextRow == fNumRows) { 52 fNextRow = 0; 53 } 54 return row; 55 } 56 57 // Returns a pointer to an "unrolled" array of rows. These rows will start 58 // at the y coordinate placed into |*firstRowIndex| and will continue in 59 // order for the maximum number of rows in this circular buffer. 60 // 61 // The |firstRowIndex_| may be negative. This means the circular buffer 62 // starts before the top of the image (it hasn't been filled yet). 63 unsigned char* const* GetRowAddresses(int* firstRowIndex) { 64 // Example for a 4-element circular buffer holding coords 6-9. 65 // Row 0 Coord 8 66 // Row 1 Coord 9 67 // Row 2 Coord 6 <- fNextRow = 2, fNextRowCoordinate = 10. 68 // Row 3 Coord 7 69 // 70 // The "next" row is also the first (lowest) coordinate. This computation 71 // may yield a negative value, but that's OK, the math will work out 72 // since the user of this buffer will compute the offset relative 73 // to the firstRowIndex and the negative rows will never be used. 74 *firstRowIndex = fNextRowCoordinate - fNumRows; 75 76 int curRow = fNextRow; 77 for (int i = 0; i < fNumRows; i++) { 78 fRowAddresses[i] = &fBuffer[curRow * fRowByteWidth]; 79 80 // Advance to the next row, wrapping if necessary. 81 curRow++; 82 if (curRow == fNumRows) { 83 curRow = 0; 84 } 85 } 86 return &fRowAddresses[0]; 87 } 88 89 private: 90 // The buffer storing the rows. They are packed, each one fRowByteWidth. 91 SkTArray<unsigned char> fBuffer; 92 93 // Number of bytes per row in the |buffer|. 94 int fRowByteWidth; 95 96 // The number of rows available in the buffer. 97 int fNumRows; 98 99 // The next row index we should write into. This wraps around as the 100 // circular buffer is used. 101 int fNextRow; 102 103 // The y coordinate of the |fNextRow|. This is incremented each time a 104 // new row is appended and does not wrap. 105 int fNextRowCoordinate; 106 107 // Buffer used by GetRowAddresses(). 108 SkTArray<unsigned char*> fRowAddresses; 109 }; 110 111 // Convolves horizontally along a single row. The row data is given in 112 // |srcData| and continues for the numValues() of the filter. 113 template<bool hasAlpha> 114 void ConvolveHorizontally(const unsigned char* srcData, 115 const SkConvolutionFilter1D& filter, 116 unsigned char* outRow) { 117 // Loop over each pixel on this row in the output image. 118 int numValues = filter.numValues(); 119 for (int outX = 0; outX < numValues; outX++) { 120 // Get the filter that determines the current output pixel. 121 int filterOffset, filterLength; 122 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = 123 filter.FilterForValue(outX, &filterOffset, &filterLength); 124 125 // Compute the first pixel in this row that the filter affects. It will 126 // touch |filterLength| pixels (4 bytes each) after this. 127 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; 128 129 // Apply the filter to the row to get the destination pixel in |accum|. 130 int accum[4] = {0}; 131 for (int filterX = 0; filterX < filterLength; filterX++) { 132 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterX]; 133 accum[0] += curFilter * rowToFilter[filterX * 4 + 0]; 134 accum[1] += curFilter * rowToFilter[filterX * 4 + 1]; 135 accum[2] += curFilter * rowToFilter[filterX * 4 + 2]; 136 if (hasAlpha) { 137 accum[3] += curFilter * rowToFilter[filterX * 4 + 3]; 138 } 139 } 140 141 // Bring this value back in range. All of the filter scaling factors 142 // are in fixed point with kShiftBits bits of fractional part. 143 accum[0] >>= SkConvolutionFilter1D::kShiftBits; 144 accum[1] >>= SkConvolutionFilter1D::kShiftBits; 145 accum[2] >>= SkConvolutionFilter1D::kShiftBits; 146 if (hasAlpha) { 147 accum[3] >>= SkConvolutionFilter1D::kShiftBits; 148 } 149 150 // Store the new pixel. 151 outRow[outX * 4 + 0] = ClampTo8(accum[0]); 152 outRow[outX * 4 + 1] = ClampTo8(accum[1]); 153 outRow[outX * 4 + 2] = ClampTo8(accum[2]); 154 if (hasAlpha) { 155 outRow[outX * 4 + 3] = ClampTo8(accum[3]); 156 } 157 } 158 } 159 160 // There's a bug somewhere here with GCC autovectorization (-ftree-vectorize). We originally 161 // thought this was 32 bit only, but subsequent tests show that some 64 bit gcc compiles 162 // suffer here too. 163 // 164 // Dropping to -O2 disables -ftree-vectorize. GCC 4.6 needs noinline. https://bug.skia.org/2575 165 #if SK_HAS_ATTRIBUTE(optimize) && defined(SK_RELEASE) 166 #define SK_MAYBE_DISABLE_VECTORIZATION __attribute__((optimize("O2"), noinline)) 167 #else 168 #define SK_MAYBE_DISABLE_VECTORIZATION 169 #endif 170 171 SK_MAYBE_DISABLE_VECTORIZATION 172 static void ConvolveHorizontallyAlpha(const unsigned char* srcData, 173 const SkConvolutionFilter1D& filter, 174 unsigned char* outRow) { 175 return ConvolveHorizontally<true>(srcData, filter, outRow); 176 } 177 178 SK_MAYBE_DISABLE_VECTORIZATION 179 static void ConvolveHorizontallyNoAlpha(const unsigned char* srcData, 180 const SkConvolutionFilter1D& filter, 181 unsigned char* outRow) { 182 return ConvolveHorizontally<false>(srcData, filter, outRow); 183 } 184 185 #undef SK_MAYBE_DISABLE_VECTORIZATION 186 187 188 // Does vertical convolution to produce one output row. The filter values and 189 // length are given in the first two parameters. These are applied to each 190 // of the rows pointed to in the |sourceDataRows| array, with each row 191 // being |pixelWidth| wide. 192 // 193 // The output must have room for |pixelWidth * 4| bytes. 194 template<bool hasAlpha> 195 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, 196 int filterLength, 197 unsigned char* const* sourceDataRows, 198 int pixelWidth, 199 unsigned char* outRow) { 200 // We go through each column in the output and do a vertical convolution, 201 // generating one output pixel each time. 202 for (int outX = 0; outX < pixelWidth; outX++) { 203 // Compute the number of bytes over in each row that the current column 204 // we're convolving starts at. The pixel will cover the next 4 bytes. 205 int byteOffset = outX * 4; 206 207 // Apply the filter to one column of pixels. 208 int accum[4] = {0}; 209 for (int filterY = 0; filterY < filterLength; filterY++) { 210 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterY]; 211 accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0]; 212 accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1]; 213 accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2]; 214 if (hasAlpha) { 215 accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3]; 216 } 217 } 218 219 // Bring this value back in range. All of the filter scaling factors 220 // are in fixed point with kShiftBits bits of precision. 221 accum[0] >>= SkConvolutionFilter1D::kShiftBits; 222 accum[1] >>= SkConvolutionFilter1D::kShiftBits; 223 accum[2] >>= SkConvolutionFilter1D::kShiftBits; 224 if (hasAlpha) { 225 accum[3] >>= SkConvolutionFilter1D::kShiftBits; 226 } 227 228 // Store the new pixel. 229 outRow[byteOffset + 0] = ClampTo8(accum[0]); 230 outRow[byteOffset + 1] = ClampTo8(accum[1]); 231 outRow[byteOffset + 2] = ClampTo8(accum[2]); 232 if (hasAlpha) { 233 unsigned char alpha = ClampTo8(accum[3]); 234 235 // Make sure the alpha channel doesn't come out smaller than any of the 236 // color channels. We use premultipled alpha channels, so this should 237 // never happen, but rounding errors will cause this from time to time. 238 // These "impossible" colors will cause overflows (and hence random pixel 239 // values) when the resulting bitmap is drawn to the screen. 240 // 241 // We only need to do this when generating the final output row (here). 242 int maxColorChannel = SkTMax(outRow[byteOffset + 0], 243 SkTMax(outRow[byteOffset + 1], 244 outRow[byteOffset + 2])); 245 if (alpha < maxColorChannel) { 246 outRow[byteOffset + 3] = maxColorChannel; 247 } else { 248 outRow[byteOffset + 3] = alpha; 249 } 250 } else { 251 // No alpha channel, the image is opaque. 252 outRow[byteOffset + 3] = 0xff; 253 } 254 } 255 } 256 257 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, 258 int filterLength, 259 unsigned char* const* sourceDataRows, 260 int pixelWidth, 261 unsigned char* outRow, 262 bool sourceHasAlpha) { 263 if (sourceHasAlpha) { 264 ConvolveVertically<true>(filterValues, filterLength, 265 sourceDataRows, pixelWidth, 266 outRow); 267 } else { 268 ConvolveVertically<false>(filterValues, filterLength, 269 sourceDataRows, pixelWidth, 270 outRow); 271 } 272 } 273 274 } // namespace 275 276 // SkConvolutionFilter1D --------------------------------------------------------- 277 278 SkConvolutionFilter1D::SkConvolutionFilter1D() 279 : fMaxFilter(0) { 280 } 281 282 SkConvolutionFilter1D::~SkConvolutionFilter1D() { 283 } 284 285 void SkConvolutionFilter1D::AddFilter(int filterOffset, 286 const ConvolutionFixed* filterValues, 287 int filterLength) { 288 // It is common for leading/trailing filter values to be zeros. In such 289 // cases it is beneficial to only store the central factors. 290 // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on 291 // a 1080p image this optimization gives a ~10% speed improvement. 292 int filterSize = filterLength; 293 int firstNonZero = 0; 294 while (firstNonZero < filterLength && filterValues[firstNonZero] == 0) { 295 firstNonZero++; 296 } 297 298 if (firstNonZero < filterLength) { 299 // Here we have at least one non-zero factor. 300 int lastNonZero = filterLength - 1; 301 while (lastNonZero >= 0 && filterValues[lastNonZero] == 0) { 302 lastNonZero--; 303 } 304 305 filterOffset += firstNonZero; 306 filterLength = lastNonZero + 1 - firstNonZero; 307 SkASSERT(filterLength > 0); 308 309 fFilterValues.append(filterLength, &filterValues[firstNonZero]); 310 } else { 311 // Here all the factors were zeroes. 312 filterLength = 0; 313 } 314 315 FilterInstance instance; 316 317 // We pushed filterLength elements onto fFilterValues 318 instance.fDataLocation = (static_cast<int>(fFilterValues.count()) - 319 filterLength); 320 instance.fOffset = filterOffset; 321 instance.fTrimmedLength = filterLength; 322 instance.fLength = filterSize; 323 fFilters.push(instance); 324 325 fMaxFilter = SkTMax(fMaxFilter, filterLength); 326 } 327 328 const SkConvolutionFilter1D::ConvolutionFixed* SkConvolutionFilter1D::GetSingleFilter( 329 int* specifiedFilterlength, 330 int* filterOffset, 331 int* filterLength) const { 332 const FilterInstance& filter = fFilters[0]; 333 *filterOffset = filter.fOffset; 334 *filterLength = filter.fTrimmedLength; 335 *specifiedFilterlength = filter.fLength; 336 if (filter.fTrimmedLength == 0) { 337 return nullptr; 338 } 339 340 return &fFilterValues[filter.fDataLocation]; 341 } 342 343 bool BGRAConvolve2D(const unsigned char* sourceData, 344 int sourceByteRowStride, 345 bool sourceHasAlpha, 346 const SkConvolutionFilter1D& filterX, 347 const SkConvolutionFilter1D& filterY, 348 int outputByteRowStride, 349 unsigned char* output, 350 const SkConvolutionProcs& convolveProcs, 351 bool useSimdIfPossible) { 352 353 int maxYFilterSize = filterY.maxFilter(); 354 355 // The next row in the input that we will generate a horizontally 356 // convolved row for. If the filter doesn't start at the beginning of the 357 // image (this is the case when we are only resizing a subset), then we 358 // don't want to generate any output rows before that. Compute the starting 359 // row for convolution as the first pixel for the first vertical filter. 360 int filterOffset, filterLength; 361 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = 362 filterY.FilterForValue(0, &filterOffset, &filterLength); 363 int nextXRow = filterOffset; 364 365 // We loop over each row in the input doing a horizontal convolution. This 366 // will result in a horizontally convolved image. We write the results into 367 // a circular buffer of convolved rows and do vertical convolution as rows 368 // are available. This prevents us from having to store the entire 369 // intermediate image and helps cache coherency. 370 // We will need four extra rows to allow horizontal convolution could be done 371 // simultaneously. We also pad each row in row buffer to be aligned-up to 372 // 16 bytes. 373 // TODO(jiesun): We do not use aligned load from row buffer in vertical 374 // convolution pass yet. Somehow Windows does not like it. 375 int rowBufferWidth = (filterX.numValues() + 15) & ~0xF; 376 int rowBufferHeight = maxYFilterSize + 377 (convolveProcs.fConvolve4RowsHorizontally ? 4 : 0); 378 379 // check for too-big allocation requests : crbug.com/528628 380 { 381 int64_t size = sk_64_mul(rowBufferWidth, rowBufferHeight); 382 // need some limit, to avoid over-committing success from malloc, but then 383 // crashing when we try to actually use the memory. 384 // 100meg seems big enough to allow "normal" zoom factors and image sizes through 385 // while avoiding the crash seen by the bug (crbug.com/528628) 386 if (size > 100 * 1024 * 1024) { 387 // SkDebugf("BGRAConvolve2D: tmp allocation [%lld] too big\n", size); 388 return false; 389 } 390 } 391 392 CircularRowBuffer rowBuffer(rowBufferWidth, 393 rowBufferHeight, 394 filterOffset); 395 396 // Loop over every possible output row, processing just enough horizontal 397 // convolutions to run each subsequent vertical convolution. 398 SkASSERT(outputByteRowStride >= filterX.numValues() * 4); 399 int numOutputRows = filterY.numValues(); 400 401 // We need to check which is the last line to convolve before we advance 4 402 // lines in one iteration. 403 int lastFilterOffset, lastFilterLength; 404 405 // SSE2 can access up to 3 extra pixels past the end of the 406 // buffer. At the bottom of the image, we have to be careful 407 // not to access data past the end of the buffer. Normally 408 // we fall back to the C++ implementation for the last row. 409 // If the last row is less than 3 pixels wide, we may have to fall 410 // back to the C++ version for more rows. Compute how many 411 // rows we need to avoid the SSE implementation for here. 412 filterX.FilterForValue(filterX.numValues() - 1, &lastFilterOffset, 413 &lastFilterLength); 414 int avoidSimdRows = 1 + convolveProcs.fExtraHorizontalReads / 415 (lastFilterOffset + lastFilterLength); 416 417 filterY.FilterForValue(numOutputRows - 1, &lastFilterOffset, 418 &lastFilterLength); 419 420 for (int outY = 0; outY < numOutputRows; outY++) { 421 filterValues = filterY.FilterForValue(outY, 422 &filterOffset, &filterLength); 423 424 // Generate output rows until we have enough to run the current filter. 425 while (nextXRow < filterOffset + filterLength) { 426 if (convolveProcs.fConvolve4RowsHorizontally && 427 nextXRow + 3 < lastFilterOffset + lastFilterLength - 428 avoidSimdRows) { 429 const unsigned char* src[4]; 430 unsigned char* outRow[4]; 431 for (int i = 0; i < 4; ++i) { 432 src[i] = &sourceData[(uint64_t)(nextXRow + i) * sourceByteRowStride]; 433 outRow[i] = rowBuffer.advanceRow(); 434 } 435 convolveProcs.fConvolve4RowsHorizontally(src, filterX, outRow, 4*rowBufferWidth); 436 nextXRow += 4; 437 } else { 438 // Check if we need to avoid SSE2 for this row. 439 if (convolveProcs.fConvolveHorizontally && 440 nextXRow < lastFilterOffset + lastFilterLength - 441 avoidSimdRows) { 442 convolveProcs.fConvolveHorizontally( 443 &sourceData[(uint64_t)nextXRow * sourceByteRowStride], 444 filterX, rowBuffer.advanceRow(), sourceHasAlpha); 445 } else { 446 if (sourceHasAlpha) { 447 ConvolveHorizontallyAlpha( 448 &sourceData[(uint64_t)nextXRow * sourceByteRowStride], 449 filterX, rowBuffer.advanceRow()); 450 } else { 451 ConvolveHorizontallyNoAlpha( 452 &sourceData[(uint64_t)nextXRow * sourceByteRowStride], 453 filterX, rowBuffer.advanceRow()); 454 } 455 } 456 nextXRow++; 457 } 458 } 459 460 // Compute where in the output image this row of final data will go. 461 unsigned char* curOutputRow = &output[(uint64_t)outY * outputByteRowStride]; 462 463 // Get the list of rows that the circular buffer has, in order. 464 int firstRowInCircularBuffer; 465 unsigned char* const* rowsToConvolve = 466 rowBuffer.GetRowAddresses(&firstRowInCircularBuffer); 467 468 // Now compute the start of the subset of those rows that the filter 469 // needs. 470 unsigned char* const* firstRowForFilter = 471 &rowsToConvolve[filterOffset - firstRowInCircularBuffer]; 472 473 if (convolveProcs.fConvolveVertically) { 474 convolveProcs.fConvolveVertically(filterValues, filterLength, 475 firstRowForFilter, 476 filterX.numValues(), curOutputRow, 477 sourceHasAlpha); 478 } else { 479 ConvolveVertically(filterValues, filterLength, 480 firstRowForFilter, 481 filterX.numValues(), curOutputRow, 482 sourceHasAlpha); 483 } 484 } 485 return true; 486 } 487