// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// common.h: contains stuff that's used throughout gemmlowp
// and should always be available.

#ifndef GEMMLOWP_INTERNAL_COMMON_H_
#define GEMMLOWP_INTERNAL_COMMON_H_

#include <pthread.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>

#include "../profiling/instrumentation.h"

// Our inline assembly paths assume GCC/Clang syntax.
// Native Client doesn't seem to support inline assembly(?).
#if defined(__GNUC__) && !defined(__native_client__)
#define GEMMLOWP_ALLOW_INLINE_ASM
#endif

// Define a macro that prevents inlining for GCC.
// For non-GCC compilers, define it as an empty macro.
#if defined(__GNUC__)
#define GEMMLOWP_NOINLINE __attribute__((noinline))
#else
#define GEMMLOWP_NOINLINE
#endif

// Detect ARM, 32-bit or 64-bit
#ifdef __arm__
#define GEMMLOWP_ARM_32
#endif

#ifdef __aarch64__
#define GEMMLOWP_ARM_64
#endif

#if defined(GEMMLOWP_ARM_32) || defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_ARM
#endif

// Detect x86, 32-bit or 64-bit
#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
#define GEMMLOWP_X86_32
#endif

#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
#define GEMMLOWP_X86_64
#endif

#if defined(GEMMLOWP_X86_32) || defined(GEMMLOWP_X86_64)
#define GEMMLOWP_X86
#endif

// Some of our optimized paths use inline assembly, and for now
// we don't bother enabling other optimized paths using intrinsics
// on platforms where we can't use inline assembly.
#ifdef GEMMLOWP_ALLOW_INLINE_ASM

// Detect NEON. It's important to check for both tokens.
#if (defined __ARM_NEON) || (defined __ARM_NEON__)
#define GEMMLOWP_NEON
#endif

// Convenience NEON tokens for 32-bit or 64-bit
#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_32)
#define GEMMLOWP_NEON_32
#endif

#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_NEON_64
#endif

// Detect SSE4.
#if defined __SSE4_1__
#define GEMMLOWP_SSE4
#endif

// Convenience SSE4 tokens for 32-bit or 64-bit
#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_32)
#define GEMMLOWP_SSE4_32
#endif

#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_64)
#define GEMMLOWP_SSE4_64
#endif

#endif  // GEMMLOWP_ALLOW_INLINE_ASM

// Detect Android. Don't conflate with ARM - we care about tuning
// for non-ARM Android devices too. This can be used in conjunction
// with x86 to tune differently for mobile x86 CPUs (Atom) vs. desktop x86 CPUs.
#if defined(__ANDROID__) || defined(ANDROID)
#define GEMMLOWP_ANDROID
#endif

namespace gemmlowp {

// Standard cache line size. Useful to optimize alignment and
// prefetches. Ideally we would query this at runtime; however,
// 64-byte cache lines are the vast majority, and even if this value is
// wrong on some device, it will be wrong by no more than a 2x factor,
// which should be acceptable.
const int kDefaultCacheLineSize = 64;

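// For example (our annotation, not original to this header), allocation
// sizes can be padded to a whole number of cache lines using the RoundUp
// helper defined later in this file:
//   int padded_size = RoundUp<kDefaultCacheLineSize>(size);
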
// Default L1 and L2 data cache sizes.
// The L1 cache size is assumed to be for each core.
// The L2 cache size is assumed to be shared among all cores. What
// we call 'L2' here is effectively the top-level cache.
//
// On x86, we should ideally query this at
// runtime. On ARM, the instruction to query this is privileged and
// Android kernels do not expose it to userspace. Fortunately, the majority
// of ARM devices have roughly comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The following values are equal to or somewhat lower than that, and were
// found to perform well on both the Nexus 5 and the Android One.
// Of course, these values are in principle too low for typical x86 CPUs,
// where we should at least set the L2 value to (L3 cache size / number of
// cores).
#if defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// ARM or ARM-like hardware (Android implies ARM-like), so here it's OK
// to tune for ARM, although on x86 Atom we might be able to query
// cache sizes at runtime, which would be better.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android. Therefore, likely desktop-class x86 hardware.
// Thus we assume larger cache sizes, though we really should query
// them at runtime.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android. Same as x86-64, but less bullish.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#else
// Less common hardware: maybe something unusual, older, or embedded.
// Assume smaller caches, but don't depart too far from what we do
// on ARM/Android, to avoid accidentally exposing unexpected behavior.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif

// The proportion of the cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and typically closer to 1,
// as we want to use most of the L2 cache for storing a large
// RHS block.
#if defined(GEMMLOWP_X86)
// On Intel architectures, use the entire L2 cache for the RHS matrix. The
// LHS matrix is not blocked for the L2 cache.
const float kDefaultL2RhsFactor = 1.00f;
#else
const float kDefaultL2RhsFactor = 0.75f;
#endif

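// Illustrative sketch (our addition, not gemmlowp's actual block-sizing
// logic; the helper name is hypothetical): how the two constants above
// combine to bound the bytes available for a packed RHS block in L2.
inline int L2RhsBlockByteBudgetSketch() {
  // E.g. with the ARM/Android defaults: 384 * 1024 * 0.75 = 294912 bytes.
  return static_cast<int>(kDefaultL2CacheSize * kDefaultL2RhsFactor);
}
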
// The number of bytes in a SIMD register. This is used to determine
// the dimensions of PackingRegisterBlock so that such blocks can
// be efficiently loaded into registers, allowing the packing code to
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values, but must ensure that their own optimized packing
// paths are consistent with this value.
const int kRegisterSize = 16;

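// For reference (our annotation): 16 bytes is one ARM NEON "q" register or
// one x86 SSE "xmm" register, i.e. sixteen uint8 values at a time.
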
// Requantization to less-than-8-bit depths is costly, so it is only worth
// doing if the GEMM width is large enough.
const int kMinimumWidthForRequantization = 100;

// Hints the CPU to prefetch the cache line containing ptr.
inline void Prefetch(const void* ptr) {
#ifdef __GNUC__  // Clang and GCC define __GNUC__ and have __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  (void)ptr;
#endif
}

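// Hypothetical usage sketch (our addition; ScanWithPrefetchSketch is not a
// gemmlowp function): prefetch the next cache line while processing the
// current one, to hide memory latency in a linear scan.
inline void ScanWithPrefetchSketch(const char* buf, int size) {
  for (int i = 0; i < size; i += kDefaultCacheLineSize) {
    // Non-faulting hint for the line we'll need on the next iteration.
    Prefetch(buf + i + kDefaultCacheLineSize);
    // ... process the cache line at buf + i ...
  }
}
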
// Returns the runtime argument rounded down to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  return i - (i % Modulus);
}

// Returns the runtime argument rounded up to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundUp(Integer i) {
  return RoundDown<Modulus>(i + Modulus - 1);
}

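// Example (our annotation): with Modulus = 8,
//   RoundDown<8>(13) == 8,  RoundUp<8>(13) == 16,
// and exact multiples are unchanged:  RoundUp<8>(16) == 16.
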
// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  return (a + b - 1) / b;
}

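// Example (our annotation): CeilQuotient(10, 4) == 3, while exact
// quotients are unchanged: CeilQuotient(8, 4) == 2.
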
// Returns the argument rounded up to the nearest power of two.
// This uses the classic bit-smearing trick: after the shifts below, every
// bit at or below the highest set bit of (n - 1) is set, so adding 1
// yields the next power of two. The shift cascade stops at 16, so this
// handles integer types of up to 32 bits.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  Integer i = n - 1;
  i |= i >> 1;
  i |= i >> 2;
  i |= i >> 4;
  i |= i >> 8;
  i |= i >> 16;
  return i + 1;
}

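// Example (our annotation): RoundUpToPowerOfTwo(33) == 64, while powers
// of two are unchanged: RoundUpToPowerOfTwo(32) == 32.
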
// Compile-time test for whether N is a power of two: a power of two has
// exactly one bit set, so N & (N - 1) clears it and yields zero.
template <int N>
struct IsPowerOfTwo {
  static const bool value = !(N & (N - 1));
};

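// Example (our annotation):
//   static_assert(IsPowerOfTwo<16>::value, "16 is a power of two");
//   static_assert(!IsPowerOfTwo<24>::value, "24 is not a power of two");
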
}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_COMMON_H_