1 /* 2 * Copyright 2014 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include "Benchmark.h" 9 #include "SkRandom.h" 10 #include "SkTemplates.h" 11 #include "SkUtils.h" 12 13 template <typename Memcpy32> 14 class Memcpy32Bench : public Benchmark { 15 public: 16 explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name) 17 : fCount(count) 18 , fMemcpy32(memcpy32) 19 , fName(SkStringPrintf("%s_%d", name, count)) {} 20 21 virtual const char* onGetName() SK_OVERRIDE { 22 return fName.c_str(); 23 } 24 25 virtual bool isSuitableFor(Backend backend) SK_OVERRIDE { 26 return backend == kNonRendering_Backend; 27 } 28 29 virtual void onPreDraw() SK_OVERRIDE { 30 fDst.reset(fCount); 31 fSrc.reset(fCount); 32 33 SkRandom rand; 34 for (int i = 0; i < fCount; i++) { 35 fSrc[i] = rand.nextU(); 36 } 37 } 38 39 virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE { 40 for (int i = 0; i < loops; i++) { 41 fMemcpy32(fDst, fSrc, fCount); 42 } 43 } 44 45 private: 46 SkAutoTMalloc<uint32_t> fDst, fSrc; 47 48 int fCount; 49 Memcpy32 fMemcpy32; 50 const SkString fName; 51 }; 52 53 template <typename Memcpy32> 54 static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) { 55 return new Memcpy32Bench<Memcpy32>(count, memcpy32, name); 56 } 57 #define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32); ) 58 59 60 // Let the libc developers do what they think is best. 61 static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) { 62 memcpy(dst, src, sizeof(uint32_t) * count); 63 } 64 BENCH(memcpy32_memcpy, 10) 65 BENCH(memcpy32_memcpy, 100) 66 BENCH(memcpy32_memcpy, 1000) 67 BENCH(memcpy32_memcpy, 10000) 68 BENCH(memcpy32_memcpy, 100000) 69 70 // Let the compiler's autovectorizer do what it thinks is best. 71 static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) { 72 while (count --> 0) { 73 *dst++ = *src++; 74 } 75 } 76 BENCH(memcpy32_autovectorize, 10) 77 BENCH(memcpy32_autovectorize, 100) 78 BENCH(memcpy32_autovectorize, 1000) 79 BENCH(memcpy32_autovectorize, 10000) 80 BENCH(memcpy32_autovectorize, 100000) 81 82 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 83 84 // Align dst to 16 bytes, then use aligned stores. src isn't algined, so use unaligned loads. 85 static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) { 86 if (count >= 16) { 87 while (uintptr_t(dst) & 0xF) { 88 *dst++ = *src++; 89 count--; 90 } 91 92 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); 93 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); 94 dst += 16 * (count / 16); 95 src += 16 * (count / 16); 96 while (count >= 16) { 97 __m128i a = _mm_loadu_si128(src128++); 98 __m128i b = _mm_loadu_si128(src128++); 99 __m128i c = _mm_loadu_si128(src128++); 100 __m128i d = _mm_loadu_si128(src128++); 101 102 _mm_store_si128(dst128++, a); 103 _mm_store_si128(dst128++, b); 104 _mm_store_si128(dst128++, c); 105 _mm_store_si128(dst128++, d); 106 107 count -= 16; 108 } 109 } 110 111 while (count --> 0) { 112 *dst++ = *src++; 113 } 114 } 115 BENCH(memcpy32_sse2_align, 10) 116 BENCH(memcpy32_sse2_align, 100) 117 BENCH(memcpy32_sse2_align, 1000) 118 BENCH(memcpy32_sse2_align, 10000) 119 BENCH(memcpy32_sse2_align, 100000) 120 121 // Leave both dst and src unaliged, and so use unaligned stores for dst and unaligned loads for src. 122 static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) { 123 __m128i* dst128 = reinterpret_cast<__m128i*>(dst); 124 const __m128i* src128 = reinterpret_cast<const __m128i*>(src); 125 dst += 16 * (count / 16); 126 src += 16 * (count / 16); 127 while (count >= 16) { 128 __m128i a = _mm_loadu_si128(src128++); 129 __m128i b = _mm_loadu_si128(src128++); 130 __m128i c = _mm_loadu_si128(src128++); 131 __m128i d = _mm_loadu_si128(src128++); 132 133 _mm_storeu_si128(dst128++, a); 134 _mm_storeu_si128(dst128++, b); 135 _mm_storeu_si128(dst128++, c); 136 _mm_storeu_si128(dst128++, d); 137 138 count -= 16; 139 } 140 141 while (count --> 0) { 142 *dst++ = *src++; 143 } 144 } 145 BENCH(memcpy32_sse2_unalign, 10) 146 BENCH(memcpy32_sse2_unalign, 100) 147 BENCH(memcpy32_sse2_unalign, 1000) 148 BENCH(memcpy32_sse2_unalign, 10000) 149 BENCH(memcpy32_sse2_unalign, 100000) 150 151 // Test our chosen best, from SkUtils.h 152 BENCH(sk_memcpy32, 10) 153 BENCH(sk_memcpy32, 100) 154 BENCH(sk_memcpy32, 1000) 155 BENCH(sk_memcpy32, 10000) 156 BENCH(sk_memcpy32, 100000) 157 158 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 159 160 #undef BENCH 161