1 /* 2 * Copyright 2011 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include "SkBlitRect_opts_SSE2.h" 9 #include "SkBlitRow.h" 10 #include "SkColorPriv.h" 11 12 #include <emmintrin.h> 13 14 /** Simple blitting of opaque rectangles less than 31 pixels wide: 15 inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. 16 */ 17 static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination, 18 int width, int height, 19 size_t rowBytes, uint32_t color) { 20 SkASSERT(255 == SkGetPackedA32(color)); 21 SkASSERT(width > 0); 22 SkASSERT(width < 31); 23 24 while (--height >= 0) { 25 SkPMColor* dst = destination; 26 int count = width; 27 28 while (count > 4) { 29 *dst++ = color; 30 *dst++ = color; 31 *dst++ = color; 32 *dst++ = color; 33 count -= 4; 34 } 35 36 while (count > 0) { 37 *dst++ = color; 38 --count; 39 } 40 41 destination = (uint32_t*)((char*)destination + rowBytes); 42 } 43 } 44 45 /** 46 Fast blitting of opaque rectangles at least 31 pixels wide: 47 inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. 48 A 31 pixel rectangle is guaranteed to have at least one 49 16-pixel aligned span that can take advantage of mm_store. 50 */ 51 static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination, 52 int width, int height, 53 size_t rowBytes, uint32_t color) { 54 SkASSERT(255 == SkGetPackedA32(color)); 55 SkASSERT(width >= 31); 56 57 __m128i color_wide = _mm_set1_epi32(color); 58 while (--height >= 0) { 59 // Prefetching one row ahead to L1 cache can equal hardware 60 // performance for large/tall rects, but never *beats* 61 // hardware performance. 62 SkPMColor* dst = destination; 63 int count = width; 64 65 while (((size_t)dst) & 0x0F) { 66 *dst++ = color; 67 --count; 68 } 69 __m128i *d = reinterpret_cast<__m128i*>(dst); 70 71 // Googling suggests _mm_stream is only going to beat _mm_store 72 // for things that wouldn't fit in L2 cache anyway, typically 73 // >500kB, and precisely fill cache lines. For us, with 74 // arrays > 100k elements _mm_stream is still 100%+ slower than 75 // mm_store. 76 77 // Unrolling to count >= 64 is a break-even for most 78 // input patterns; we seem to be saturating the bus and having 79 // low enough overhead at 32. 80 81 while (count >= 32) { 82 _mm_store_si128(d++, color_wide); 83 _mm_store_si128(d++, color_wide); 84 _mm_store_si128(d++, color_wide); 85 _mm_store_si128(d++, color_wide); 86 _mm_store_si128(d++, color_wide); 87 _mm_store_si128(d++, color_wide); 88 _mm_store_si128(d++, color_wide); 89 _mm_store_si128(d++, color_wide); 90 count -= 32; 91 } 92 if (count >= 16) { 93 _mm_store_si128(d++, color_wide); 94 _mm_store_si128(d++, color_wide); 95 _mm_store_si128(d++, color_wide); 96 _mm_store_si128(d++, color_wide); 97 count -= 16; 98 } 99 dst = reinterpret_cast<uint32_t*>(d); 100 101 // Unrolling the loop in the Narrow code is a significant performance 102 // gain, but unrolling this loop appears to make no difference in 103 // benchmarks with either mm_store_si128 or individual sets. 104 105 while (count > 0) { 106 *dst++ = color; 107 --count; 108 } 109 110 destination = (uint32_t*)((char*)destination + rowBytes); 111 } 112 } 113 114 void ColorRect32_SSE2(SkPMColor* destination, 115 int width, int height, 116 size_t rowBytes, uint32_t color) { 117 if (0 == height || 0 == width || 0 == color) { 118 return; 119 } 120 unsigned colorA = SkGetPackedA32(color); 121 colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423). 122 if (255 == colorA) { 123 if (width < 31) { 124 BlitRect32_OpaqueNarrow_SSE2(destination, width, height, 125 rowBytes, color); 126 } else { 127 BlitRect32_OpaqueWide_SSE2(destination, width, height, 128 rowBytes, color); 129 } 130 } else { 131 SkBlitRow::ColorRect32(destination, width, height, rowBytes, color); 132 } 133 } 134