// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// common.h: contains stuff that's used throughout gemmlowp
// and should always be available.

#ifndef GEMMLOWP_INTERNAL_COMMON_H_
#define GEMMLOWP_INTERNAL_COMMON_H_

#include "../internal/platform.h"
#include "../profiling/pthread_everywhere.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>

#include "../profiling/instrumentation.h"

// Our inline assembly paths assume GCC/Clang syntax.
// Native Client doesn't seem to support inline assembly.
#if defined(__GNUC__) && !defined(__native_client__)
#define GEMMLOWP_ALLOW_INLINE_ASM
#endif

// Define a macro that prevents inlining, for GCC.
// For non-GCC compilers, define it as an empty macro.
#if defined(__GNUC__)
#define GEMMLOWP_NOINLINE __attribute__((noinline))
#else
#define GEMMLOWP_NOINLINE
#endif

// Detect ARM, 32-bit or 64-bit
#ifdef __arm__
#define GEMMLOWP_ARM_32
#endif

#ifdef __aarch64__
#define GEMMLOWP_ARM_64
#endif

#if defined(GEMMLOWP_ARM_32) || defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_ARM
#endif

// Detect MIPS, 32-bit or 64-bit
#if defined(__mips) && !defined(__LP64__)
#define GEMMLOWP_MIPS_32
#endif

#if defined(__mips) && defined(__LP64__)
#define GEMMLOWP_MIPS_64
#endif

#if defined(GEMMLOWP_MIPS_32) || defined(GEMMLOWP_MIPS_64)
#define GEMMLOWP_MIPS
#endif

// Detect x86, 32-bit or 64-bit
#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
#define GEMMLOWP_X86_32
#endif

#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
#define GEMMLOWP_X86_64
#endif

#if defined(GEMMLOWP_X86_32) || defined(GEMMLOWP_X86_64)
#define GEMMLOWP_X86
#endif

// Some of our optimized paths use inline assembly, and for now we don't
// bother enabling other optimized paths that use intrinsics on platforms
// where we can't use the inline assembly paths.
#ifdef GEMMLOWP_ALLOW_INLINE_ASM

// Detect NEON. It's important to check for both tokens.
#if (defined __ARM_NEON) || (defined __ARM_NEON__)
#define GEMMLOWP_NEON
#endif

// Convenience NEON tokens for 32-bit or 64-bit
#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_32)
#define GEMMLOWP_NEON_32
#endif

#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_NEON_64
#endif

// Detect MIPS MSA.
// Limit MSA optimizations to little-endian CPUs for now.
// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
#if defined(GEMMLOWP_MIPS) && (__mips_isa_rev >= 5) && defined(__mips_msa) && \
    defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define GEMMLOWP_MSA
#endif

// Convenience MIPS MSA tokens for 32-bit or 64-bit.
#if defined(GEMMLOWP_MSA) && defined(GEMMLOWP_MIPS_32)
#define GEMMLOWP_MSA_32
#endif

#if defined(GEMMLOWP_MSA) && defined(GEMMLOWP_MIPS_64)
#define GEMMLOWP_MSA_64
#endif

// Detect SSE.
#ifdef __SSE4_1__
#define GEMMLOWP_SSE4
#endif

#ifdef __SSE3__
#define GEMMLOWP_SSE3
#endif

// Convenience SSE4 tokens for 32-bit or 64-bit
#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_32) && \
   !defined(GEMMLOWP_DISABLE_SSE4)
#define GEMMLOWP_SSE4_32
#endif

#if defined(GEMMLOWP_SSE3) && defined(GEMMLOWP_X86_32)
#define GEMMLOWP_SSE3_32
#endif

#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_64) && \
   !defined(GEMMLOWP_DISABLE_SSE4)
#define GEMMLOWP_SSE4_64
#endif

#if defined(GEMMLOWP_SSE3) && defined(GEMMLOWP_X86_64)
#define GEMMLOWP_SSE3_64
#endif

#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __msan_unpoison
#elif __has_feature(address_sanitizer)
#include <sanitizer/asan_interface.h>
#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __asan_unpoison_memory_region
#endif
#endif

#endif  // GEMMLOWP_ALLOW_INLINE_ASM

// Detect Android. Don't conflate with ARM - we care about tuning
// for non-ARM Android devices too. This can be used in conjunction
// with x86 to tune differently for mobile x86 CPUs (Atom) vs. desktop x86 CPUs.
#if defined(__ANDROID__) || defined(ANDROID)
#define GEMMLOWP_ANDROID
#endif

namespace gemmlowp {

// Standard cache line size. Useful to optimize alignment and
// prefetches. Ideally we would query this at runtime, however
// 64 byte cache lines are the vast majority, and even if it's
// wrong on some device, it will be wrong by no more than a 2x factor,
// which should be acceptable.
const int kDefaultCacheLineSize = 64;

// Default L1 and L2 data cache sizes.
// The L1 cache size is assumed to be for each core.
// The L2 cache size is assumed to be shared among all cores. What
// we call 'L2' here is effectively top-level cache.
//
// On x86, we should ideally query this at
// runtime. On ARM, the instruction to query this is privileged and
// Android kernels do not expose it to userspace. Fortunately, the majority
// of ARM devices have roughly comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The following values are equal to or somewhat lower than that, and were
// found to perform well on both the Nexus 5 and Android One.
// Of course, these values are in principle too low for typical x86 CPUs
// where we should set the L2 value to (L3 cache size / number of cores) at
// least.
//
#if defined(GEMMLOWP_ARM) && defined(__APPLE__)
// iPhone/iPad
const int kDefaultL1CacheSize = 48 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// Other ARM or ARM-like hardware (Android implies ARM-like), so it's OK here
// to tune for ARM, although on x86 Atom we might be able to query
// cache sizes at runtime, which would be better.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android. Therefore, likely desktop-class x86 hardware.
// Thus we assume larger cache sizes, though we really should query
// them at runtime.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android. Same as x86-64 but less bullish.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_MIPS)
// MIPS and not Android. TODO: MIPS and Android?
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 1024 * 1024;
#else
// Less common hardware. Maybe some unusual or older or embedded thing.
// Assume smaller caches, but don't depart too far from what we do
// on ARM/Android to avoid accidentally exposing unexpected behavior.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif

// The proportion of the L2 cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and typically closer to 1,
// as we generally want to use most of the L2 cache for storing a large
// RHS block.
#if defined(GEMMLOWP_X86)
// For IA, use the entire L2 cache for the RHS matrix. LHS matrix is not blocked
// for L2 cache.
const float kDefaultL2RhsFactor = 1.00f;
#else
const float kDefaultL2RhsFactor = 0.75f;
#endif
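// Worked example (rough, using the default constants above): with the
// ARM/Android defaults, the RHS block budget is about
// kDefaultL2RhsFactor * kDefaultL2CacheSize = 0.75 * 384 KiB = 288 KiB of L2.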

// The number of bytes in a SIMD register. This is used to determine
// the dimensions of PackingRegisterBlock so that such blocks can
// be efficiently loaded into registers, so that packing code can
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values but must ensure that their own optimized packing paths
// are consistent with this value.
const int kRegisterSize = 16;
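// 16 bytes corresponds to the 128-bit SIMD registers targeted by the
// optimized paths (NEON q-registers, SSE xmm registers, MSA vector registers).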

// Hints the CPU to prefetch the cache line containing ptr.
inline void Prefetch(const void* ptr) {
#if defined GEMMLOWP_ARM_64 && defined GEMMLOWP_ALLOW_INLINE_ASM
  // AArch64 has very detailed prefetch instructions that compilers
  // can't know how to map __builtin_prefetch to, and as a result don't,
  // leaving __builtin_prefetch a no-op on this architecture.
  // For our purposes, "pldl1keep" is usually what we want, meaning:
  // "prefetch for load, into L1 cache, using each value multiple times".
  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
#elif defined \
    __GNUC__  // Clang and GCC define __GNUC__ and have __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  (void)ptr;
#endif
}
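// Hypothetical usage sketch: a packing or kernel loop would typically prefetch
// one cache line ahead of the data it is about to read, e.g.
//   Prefetch(src_ptr + kDefaultCacheLineSize);
// where src_ptr stands for the loop's current read pointer.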

// Returns the runtime argument rounded down to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  return i - (i % Modulus);
}

// Returns the runtime argument rounded up to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundUp(Integer i) {
  return RoundDown<Modulus>(i + Modulus - 1);
}
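// e.g. RoundDown<8>(13) == 8 and RoundUp<8>(13) == 16; exact multiples of
// Modulus are left unchanged by both.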

// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  return (a + b - 1) / b;
}
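// e.g. CeilQuotient(13, 4) == 4, while CeilQuotient(12, 4) == 3.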

// Returns the argument rounded up to the nearest power of two.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  Integer i = n - 1;
  i |= i >> 1;
  i |= i >> 2;
  i |= i >> 4;
  i |= i >> 8;
  i |= i >> 16;
  return i + 1;
}
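// e.g. RoundUpToPowerOfTwo(33) == 64, and powers of two map to themselves.
// Note that the bit-smearing stops after a 16-bit shift, so this assumes
// values that fit in 32 bits.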

// Compile-time check of whether N is a power of two.
template <int N>
struct IsPowerOfTwo {
  static const bool value = !(N & (N - 1));
};
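// e.g. IsPowerOfTwo<16>::value is true, IsPowerOfTwo<12>::value is false.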

// Under MemorySanitizer or AddressSanitizer builds (see the sanitizer
// detection above), marks the memory range [ptr, ptr + size * sizeof(T)) as
// initialized / unpoisoned; otherwise this is a no-op.
template <typename T>
void MarkMemoryAsInitialized(T* ptr, int size) {
#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED
  GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(static_cast<void*>(ptr),
                                      size * sizeof(T));
#else
  (void)ptr;
  (void)size;
#endif
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_COMMON_H_