1 // Copyright 2015 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // common.h: contains stuff that's used throughout gemmlowp 16 // and should always be available. 17 18 #ifndef GEMMLOWP_INTERNAL_COMMON_H_ 19 #define GEMMLOWP_INTERNAL_COMMON_H_ 20 21 #include <pthread.h> 22 23 #include <algorithm> 24 #include <cassert> 25 #include <cmath> 26 #include <cstdlib> 27 28 #include "../profiling/instrumentation.h" 29 30 // Our inline assembly path assume GCC/Clang syntax. 31 // Native Client doesn't seem to support inline assembly(?). 32 #if defined(__GNUC__) && !defined(__native_client__) 33 #define GEMMLOWP_ALLOW_INLINE_ASM 34 #endif 35 36 // Define macro statement that avoids inlining for GCC. 37 // For non-GCC, define as empty macro. 
#ifdef __GNUC__
#define GEMMLOWP_NOINLINE __attribute__((noinline))
#else
#define GEMMLOWP_NOINLINE
#endif

// ARM detection: one token per word size, plus an umbrella token.
#if defined(__arm__)
#define GEMMLOWP_ARM_32
#endif

#if defined(__aarch64__)
#define GEMMLOWP_ARM_64
#endif

#if defined(GEMMLOWP_ARM_64) || defined(GEMMLOWP_ARM_32)
#define GEMMLOWP_ARM
#endif

// x86 detection: one token per word size, plus an umbrella token.
// Several spellings are checked because compilers disagree on the
// predefined macro name.
#if defined(__i386__) || defined(__i386) || defined(_M_IX86) || defined(_X86_)
#define GEMMLOWP_X86_32
#endif

#if defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#define GEMMLOWP_X86_64
#endif

#if defined(GEMMLOWP_X86_64) || defined(GEMMLOWP_X86_32)
#define GEMMLOWP_X86
#endif

// SIMD tokens are only defined when inline assembly is allowed: some of
// the optimized paths are written in inline assembly, and for now we do
// not bother enabling the remaining intrinsics-based optimized paths on
// targets where the inline assembly paths cannot be used.
#if defined(GEMMLOWP_ALLOW_INLINE_ASM)

// NEON. Both spellings of the predefined macro must be checked.
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#define GEMMLOWP_NEON
#endif

// NEON convenience tokens, split by ARM word size.
#ifdef GEMMLOWP_NEON
#ifdef GEMMLOWP_ARM_32
#define GEMMLOWP_NEON_32
#endif
#ifdef GEMMLOWP_ARM_64
#define GEMMLOWP_NEON_64
#endif
#endif  // GEMMLOWP_NEON

// SSE4 (keyed on SSE4.1 support).
#if defined(__SSE4_1__)
#define GEMMLOWP_SSE4
#endif

// SSE4 convenience tokens, split by x86 word size.
#ifdef GEMMLOWP_SSE4
#ifdef GEMMLOWP_X86_32
#define GEMMLOWP_SSE4_32
#endif
#ifdef GEMMLOWP_X86_64
#define GEMMLOWP_SSE4_64
#endif
#endif  // GEMMLOWP_SSE4

#endif  // GEMMLOWP_ALLOW_INLINE_ASM

// Detect Android. Don't conflate with ARM - we care about tuning
// for non-ARM Android devices too. This can be used in conjunction
// with x86 to tune differently for mobile x86 CPUs (Atom) vs. desktop x86 CPUs.
#if defined(__ANDROID__) || defined(ANDROID)
#define GEMMLOWP_ANDROID
#endif

namespace gemmlowp {

// Standard cache line size. Useful to optimize alignment and
// prefetches. Ideally we would query this at runtime, however
// 64 byte cache lines are the vast majority, and even if it's
// wrong on some device, it will be wrong by no more than a 2x factor,
// which should be acceptable.
const int kDefaultCacheLineSize = 64;

// Default L1 and L2 data cache sizes.
// The L1 cache size is assumed to be for each core.
// The L2 cache size is assumed to be shared among all cores. What
// we call 'L2' here is effectively top-level cache.
//
// On x86, we should ideally query this at
// runtime. On ARM, the instruction to query this is privileged and
// Android kernels do not expose it to userspace. Fortunately, the majority
// of ARM devices have roughly comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The following values are equal to or somewhat lower than that, and were
// found to perform well on both the Nexus 5 and Android One.
// Of course, these values are in principle too low for typical x86 CPUs
// where we should set the L2 value to (L3 cache size / number of cores) at
// least.
#if defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// ARM or ARM-like hardware (Android implies ARM-like) so here it's OK
// to tune for ARM, although on x86 Atom we might be able to query
// cache sizes at runtime, which would be better.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android. Therefore, likely desktop-class x86 hardware.
// Thus we assume larger cache sizes, though we really should query
// them at runtime.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android. Same as x86-64 but less bullish.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#else
// Less common hardware. Maybe some unusual or older or embedded thing.
// Assume smaller caches, but don't depart too far from what we do
// on ARM/Android to avoid accidentally exposing unexpected behavior.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif

// The proportion of the cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and typically closer to 1,
// as we typically want to use most of the L2 cache for storing a large
// RHS block.
#if defined(GEMMLOWP_X86)
// For IA, use the entire L2 cache for the RHS matrix. LHS matrix is not blocked
// for L2 cache.
const float kDefaultL2RhsFactor = 1.00f;
#else
const float kDefaultL2RhsFactor = 0.75f;
#endif

// The number of bytes in a SIMD register. This is used to determine
// the dimensions of PackingRegisterBlock so that such blocks can
// be efficiently loaded into registers, so that packing code can
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values but must ensure that their own optimized packing paths
// are consistent with this value.
const int kRegisterSize = 16;

// Requantization to less-than-8-bit is costly, so it is only worth
// doing if the GEMM width is large enough.
const int kMinimumWidthForRequantization = 100;

// Hints the CPU to prefetch the cache line containing ptr.
// Compiles to nothing on compilers that do not define __GNUC__.
inline void Prefetch(const void* ptr) {
#ifdef __GNUC__  // Clang and GCC define __GNUC__ and have __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  (void)ptr;
#endif
}

// Returns the runtime argument rounded down to the nearest multiple of
// the fixed Modulus.
// Modulus is unsigned, so the arithmetic mixes Integer with unsigned under
// the usual arithmetic conversions; intended for non-negative i and a
// non-zero Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  return i - (i % Modulus);
}

// Returns the runtime argument rounded up to the nearest multiple of
// the fixed Modulus.
// Implemented as RoundDown of (i + Modulus - 1), so the same remarks about
// non-negative i and non-zero Modulus apply, and i + Modulus - 1 must not
// overflow.
template <unsigned Modulus, typename Integer>
Integer RoundUp(Integer i) {
  return RoundDown<Modulus>(i + Modulus - 1);
}

// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
// Computed as (a + b - 1) / b, so it is meant for non-negative a and
// positive b.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  return (a + b - 1) / b;
}

// Returns the argument rounded up to the nearest power of two.
// A value that already is a power of two is returned unchanged, and
// 0 maps to 0.
//
// Works by copying the highest set bit of (n - 1) into every lower bit
// position and then adding one. The shift distance doubles each step and
// stops below the bit width of Integer, so 64-bit types are fully covered
// and no shift ever reaches the width of the type.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  Integer i = n - 1;
  for (unsigned shift = 1; shift < 8 * sizeof(Integer); shift *= 2) {
    i |= i >> shift;
  }
  return i + 1;
}

// Compile-time predicate: value is true iff N is a positive power of two.
// Zero and negative N yield false; the N > 0 test short-circuits before
// N - 1 is evaluated.
template <int N>
struct IsPowerOfTwo {
  static const bool value = (N > 0) && !(N & (N - 1));
};

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_COMMON_H_