Home | History | Annotate | Download | only in bench
      1 /*
      2  * Copyright 2016 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include "Benchmark.h"
      9 #include "SkTypes.h"
     10 
     11 /**
     12  * There's a good variety of ways to pack from int down to uint16_t with SSE,
     13  * depending on the specific instructions available.
     14  *
     15  * SSE2 offers an int -> int16_t pack instruction.  We can use this in two ways:
     16  *    - subtract off 32768, int -> int16_t, add 32768 back                                  (sse2_a)
     17  *    - first artificially sign extend the (positive) value in our int, then int -> int16_t (sse2_b)
     18  * SSSE3 adds a byte shuffle, so we just put the bytes where we want them.                  (ssse3)
     19  * SSE41 added an int -> uint16_t pack instruction.                                         (sse41)
     20  *
     21  * Findings so far:
     22  *   - sse41 < ssse3 <<< sse2_b < sse2_a;
     23  *   - the ssse3 version is only slightly slower than the sse41 version, maybe not at all
     24  *   - the sse2_a version is only slightly slower than the sse2_b version
     25  *   - the ssse3 and sse41 versions are about 3x faster than either sse2 version
     26  *   - the sse41 version seems to cause some code generation trouble.
     27  */
     28 
     29 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
     30 
     31 #include <immintrin.h>
     32 
     33 template <__m128i (kernel)(__m128i)>
     34 class pack_int_uint16_t_Bench : public Benchmark {
     35 public:
     36     pack_int_uint16_t_Bench(const char* impl) {
     37         fName.append("pack_int_uint16_t_");
     38         fName.append(impl);
     39     }
     40 
     41     bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
     42     const char* onGetName() override { return fName.c_str(); }
     43 
     44     void onDraw(int loops, SkCanvas*) override {
     45         __m128i x = _mm_set1_epi32(0x42424242);
     46         while (loops --> 0) {
     47             x = kernel(x);
     48         }
     49 
     50         volatile int blackhole = 0;
     51         blackhole ^= _mm_cvtsi128_si32(x);
     52     }
     53 
     54     SkString fName;
     55 };
     56 
     57 namespace {
     58     __m128i sse2_a(__m128i x) {
     59         x = _mm_sub_epi32(x, _mm_set1_epi32(0x8000));
     60         return _mm_add_epi16(_mm_packs_epi32(x,x), _mm_set1_epi16((short)0x8000));
     61     }
     62 }
     63 DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_a>("sse2_a"); )
     64 
     65 namespace {
     66     __m128i sse2_b(__m128i x) {
     67         x = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16);
     68         return _mm_packs_epi32(x,x);
     69     }
     70 }
     71 DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_b>("sse2_b"); )
     72 
     73 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
     74 namespace {
     75     __m128i ssse3(__m128i x) {
     76         // TODO: Can we force the bench to load the mask inside the loop?  Would be more realistic.
     77         const int _ = ~0;
     78         return _mm_shuffle_epi8(x, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));
     79     }
     80 }
     81 DEF_BENCH( return new pack_int_uint16_t_Bench<ssse3>("ssse3"); )
     82 #endif
     83 
     84 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
     85 namespace {
     86     __m128i sse41(__m128i x) {
     87         return _mm_packus_epi32(x,x);
     88     }
     89 }
     90 DEF_BENCH( return new pack_int_uint16_t_Bench<sse41>("sse41"); )
     91 #endif
     92 
     93 #endif  // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
     94