Home | History | Annotate | Download | only in bench
      1 /*
      2  * Copyright 2014 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include "Benchmark.h"
      9 #include "SkRandom.h"
     10 #include "SkTemplates.h"
     11 #include "SkUtils.h"
     12 
     13 template <typename Memcpy32>
     14 class Memcpy32Bench : public Benchmark {
     15 public:
     16     explicit Memcpy32Bench(int count, Memcpy32 memcpy32, const char* name)
     17         : fCount(count)
     18         , fMemcpy32(memcpy32)
     19         , fName(SkStringPrintf("%s_%d", name, count)) {}
     20 
     21     virtual const char* onGetName() SK_OVERRIDE {
     22         return fName.c_str();
     23     }
     24 
     25     virtual bool isSuitableFor(Backend backend) SK_OVERRIDE {
     26         return backend == kNonRendering_Backend;
     27     }
     28 
     29     virtual void onPreDraw() SK_OVERRIDE {
     30         fDst.reset(fCount);
     31         fSrc.reset(fCount);
     32 
     33         SkRandom rand;
     34         for (int i = 0; i < fCount; i++) {
     35             fSrc[i] = rand.nextU();
     36         }
     37     }
     38 
     39     virtual void onDraw(const int loops, SkCanvas*) SK_OVERRIDE {
     40         for (int i = 0; i < loops; i++) {
     41             fMemcpy32(fDst, fSrc, fCount);
     42         }
     43     }
     44 
     45 private:
     46     SkAutoTMalloc<uint32_t> fDst, fSrc;
     47 
     48     int fCount;
     49     Memcpy32 fMemcpy32;
     50     const SkString fName;
     51 };
     52 
     53 template <typename Memcpy32>
     54 static Memcpy32Bench<Memcpy32>* Bench(int count, Memcpy32 memcpy32, const char* name) {
     55     return new Memcpy32Bench<Memcpy32>(count, memcpy32, name);
     56 }
     57 #define BENCH(memcpy32, count) DEF_BENCH(return Bench(count, memcpy32, #memcpy32); )
     58 
     59 
     60 // Let the libc developers do what they think is best.
     61 static void memcpy32_memcpy(uint32_t* dst, const uint32_t* src, int count) {
     62     memcpy(dst, src, sizeof(uint32_t) * count);
     63 }
// Register memcpy32_memcpy once per buffer size, from 10 up to 100000 32-bit elements.
BENCH(memcpy32_memcpy, 10)
BENCH(memcpy32_memcpy, 100)
BENCH(memcpy32_memcpy, 1000)
BENCH(memcpy32_memcpy, 10000)
BENCH(memcpy32_memcpy, 100000)
     69 
     70 // Let the compiler's autovectorizer do what it thinks is best.
     71 static void memcpy32_autovectorize(uint32_t* dst, const uint32_t* src, int count) {
     72     while (count --> 0) {
     73         *dst++ = *src++;
     74     }
     75 }
// Register memcpy32_autovectorize once per buffer size, from 10 up to 100000 32-bit elements.
BENCH(memcpy32_autovectorize, 10)
BENCH(memcpy32_autovectorize, 100)
BENCH(memcpy32_autovectorize, 1000)
BENCH(memcpy32_autovectorize, 10000)
BENCH(memcpy32_autovectorize, 100000)
     81 
     82 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
     83 
     84 // Align dst to 16 bytes, then use aligned stores.  src isn't algined, so use unaligned loads.
     85 static void memcpy32_sse2_align(uint32_t* dst, const uint32_t* src, int count) {
     86     if (count >= 16) {
     87         while (uintptr_t(dst) & 0xF) {
     88             *dst++ = *src++;
     89             count--;
     90         }
     91 
     92         __m128i* dst128 = reinterpret_cast<__m128i*>(dst);
     93         const __m128i* src128 = reinterpret_cast<const __m128i*>(src);
     94         dst += 16 * (count / 16);
     95         src += 16 * (count / 16);
     96         while (count >= 16) {
     97             __m128i a = _mm_loadu_si128(src128++);
     98             __m128i b = _mm_loadu_si128(src128++);
     99             __m128i c = _mm_loadu_si128(src128++);
    100             __m128i d = _mm_loadu_si128(src128++);
    101 
    102             _mm_store_si128(dst128++, a);
    103             _mm_store_si128(dst128++, b);
    104             _mm_store_si128(dst128++, c);
    105             _mm_store_si128(dst128++, d);
    106 
    107             count -= 16;
    108         }
    109     }
    110 
    111     while (count --> 0) {
    112         *dst++ = *src++;
    113     }
    114 }
// Register memcpy32_sse2_align once per buffer size, from 10 up to 100000 32-bit elements.
BENCH(memcpy32_sse2_align, 10)
BENCH(memcpy32_sse2_align, 100)
BENCH(memcpy32_sse2_align, 1000)
BENCH(memcpy32_sse2_align, 10000)
BENCH(memcpy32_sse2_align, 100000)
    120 
    121 // Leave both dst and src unaliged, and so use unaligned stores for dst and unaligned loads for src.
    122 static void memcpy32_sse2_unalign(uint32_t* dst, const uint32_t* src, int count) {
    123     __m128i* dst128 = reinterpret_cast<__m128i*>(dst);
    124     const __m128i* src128 = reinterpret_cast<const __m128i*>(src);
    125     dst += 16 * (count / 16);
    126     src += 16 * (count / 16);
    127     while (count >= 16) {
    128         __m128i a = _mm_loadu_si128(src128++);
    129         __m128i b = _mm_loadu_si128(src128++);
    130         __m128i c = _mm_loadu_si128(src128++);
    131         __m128i d = _mm_loadu_si128(src128++);
    132 
    133         _mm_storeu_si128(dst128++, a);
    134         _mm_storeu_si128(dst128++, b);
    135         _mm_storeu_si128(dst128++, c);
    136         _mm_storeu_si128(dst128++, d);
    137 
    138         count -= 16;
    139     }
    140 
    141     while (count --> 0) {
    142         *dst++ = *src++;
    143     }
    144 }
// Register memcpy32_sse2_unalign once per buffer size, from 10 up to 100000 32-bit elements.
BENCH(memcpy32_sse2_unalign, 10)
BENCH(memcpy32_sse2_unalign, 100)
BENCH(memcpy32_sse2_unalign, 1000)
BENCH(memcpy32_sse2_unalign, 10000)
BENCH(memcpy32_sse2_unalign, 100000)
    150 
// Test our chosen best, from SkUtils.h
// Registered at the same buffer sizes as the routines above so the results are comparable.
// NOTE(review): these registrations sit inside the SK_CPU_SSE_LEVEL >= SSE2 guard, so
// sk_memcpy32 is only benchmarked on SSE2 builds -- confirm that is intended.
BENCH(sk_memcpy32, 10)
BENCH(sk_memcpy32, 100)
BENCH(sk_memcpy32, 1000)
BENCH(sk_memcpy32, 10000)
BENCH(sk_memcpy32, 100000)
    157 
    158 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    159 
    160 #undef BENCH
    161