Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2011 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include <emmintrin.h>
      9 #include "SkBlitRect_opts_SSE2.h"
     10 #include "SkBlitRow.h"
     11 #include "SkColorPriv.h"
     12 
     13 /* Simple blitting of opaque rectangles less than 31 pixels wide:
     14  * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
     15  */
     16 static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination,
     17                                   int width, int height,
     18                                   size_t rowBytes, uint32_t color) {
     19     SkASSERT(255 == SkGetPackedA32(color));
     20     SkASSERT(width > 0);
     21     SkASSERT(width < 31);
     22 
     23     while (--height >= 0) {
     24         SkPMColor* dst = destination;
     25         int count = width;
     26 
     27         while (count > 4) {
     28             *dst++ = color;
     29             *dst++ = color;
     30             *dst++ = color;
     31             *dst++ = color;
     32             count -= 4;
     33         }
     34 
     35         while (count > 0) {
     36             *dst++ = color;
     37             --count;
     38         }
     39 
     40         destination = (uint32_t*)((char*)destination + rowBytes);
     41     }
     42 }
     43 
     44 /*
     45  * Fast blitting of opaque rectangles at least 31 pixels wide:
     46  * inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2.
     47  * A 31 pixel rectangle is guaranteed to have at least one
     48  * 16-pixel aligned span that can take advantage of mm_store.
     49  */
     50 static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination,
     51                                 int width, int height,
     52                                 size_t rowBytes, uint32_t color) {
     53     SkASSERT(255 == SkGetPackedA32(color));
     54     SkASSERT(width >= 31);
     55 
     56     __m128i color_wide = _mm_set1_epi32(color);
     57     while (--height >= 0) {
     58         // Prefetching one row ahead to L1 cache can equal hardware
     59         // performance for large/tall rects, but never *beats*
     60         // hardware performance.
     61         SkPMColor* dst = destination;
     62         int count = width;
     63 
     64         while (((size_t)dst) & 0x0F) {
     65             *dst++ = color;
     66             --count;
     67         }
     68         __m128i *d = reinterpret_cast<__m128i*>(dst);
     69 
     70         // Googling suggests _mm_stream is only going to beat _mm_store
     71         // for things that wouldn't fit in L2 cache anyway, typically
     72         // >500kB, and precisely fill cache lines.  For us, with
     73         // arrays > 100k elements _mm_stream is still 100%+ slower than
     74         // mm_store.
     75 
     76         // Unrolling to count >= 64 is a break-even for most
     77         // input patterns; we seem to be saturating the bus and having
     78         // low enough overhead at 32.
     79 
     80         while (count >= 32) {
     81             _mm_store_si128(d++, color_wide);
     82             _mm_store_si128(d++, color_wide);
     83             _mm_store_si128(d++, color_wide);
     84             _mm_store_si128(d++, color_wide);
     85             _mm_store_si128(d++, color_wide);
     86             _mm_store_si128(d++, color_wide);
     87             _mm_store_si128(d++, color_wide);
     88             _mm_store_si128(d++, color_wide);
     89             count -= 32;
     90         }
     91         if (count >= 16) {
     92             _mm_store_si128(d++, color_wide);
     93             _mm_store_si128(d++, color_wide);
     94             _mm_store_si128(d++, color_wide);
     95             _mm_store_si128(d++, color_wide);
     96             count -= 16;
     97         }
     98         dst = reinterpret_cast<uint32_t*>(d);
     99 
    100         // Unrolling the loop in the Narrow code is a significant performance
    101         // gain, but unrolling this loop appears to make no difference in
    102         // benchmarks with either mm_store_si128 or individual sets.
    103 
    104         while (count > 0) {
    105             *dst++ = color;
    106             --count;
    107         }
    108 
    109         destination = (uint32_t*)((char*)destination + rowBytes);
    110     }
    111 }
    112 
    113 void ColorRect32_SSE2(SkPMColor* destination,
    114                       int width, int height,
    115                       size_t rowBytes, uint32_t color) {
    116     if (0 == height || 0 == width || 0 == color) {
    117         return;
    118     }
    119     unsigned colorA = SkGetPackedA32(color);
    120     colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423).
    121     if (255 == colorA) {
    122         if (width < 31) {
    123             BlitRect32_OpaqueNarrow_SSE2(destination, width, height,
    124                                          rowBytes, color);
    125         } else {
    126             BlitRect32_OpaqueWide_SSE2(destination, width, height,
    127                                        rowBytes, color);
    128         }
    129     } else {
    130         SkBlitRow::ColorRect32(destination, width, height, rowBytes, color);
    131     }
    132 }
    133