// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// common.h: contains stuff that's used throughout gemmlowp
// and should always be available.

#ifndef GEMMLOWP_INTERNAL_COMMON_H_
#define GEMMLOWP_INTERNAL_COMMON_H_

#include "../internal/platform.h"
#include "../profiling/pthread_everywhere.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>

#include "../profiling/instrumentation.h"

// Our inline assembly paths assume GCC/Clang syntax.
// Native Client doesn't seem to support inline assembly(?).
#if defined(__GNUC__) && !defined(__native_client__)
#define GEMMLOWP_ALLOW_INLINE_ASM
#endif

// Define a macro that prevents inlining on GCC.
// For non-GCC compilers, define it as an empty macro.
#if defined(__GNUC__)
#define GEMMLOWP_NOINLINE __attribute__((noinline))
#else
#define GEMMLOWP_NOINLINE
#endif

// Detect ARM, 32-bit or 64-bit.
#ifdef __arm__
#define GEMMLOWP_ARM_32
#endif

#ifdef __aarch64__
#define GEMMLOWP_ARM_64
#endif

#if defined(GEMMLOWP_ARM_32) || defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_ARM
#endif

// Detect MIPS, 32-bit or 64-bit.
#if defined(__mips) && !defined(__LP64__)
#define GEMMLOWP_MIPS_32
#endif

#if defined(__mips) && defined(__LP64__)
#define GEMMLOWP_MIPS_64
#endif

#if defined(GEMMLOWP_MIPS_32) || defined(GEMMLOWP_MIPS_64)
#define GEMMLOWP_MIPS
#endif

// Detect x86, 32-bit or 64-bit.
#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
#define GEMMLOWP_X86_32
#endif

#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
#define GEMMLOWP_X86_64
#endif

#if defined(GEMMLOWP_X86_32) || defined(GEMMLOWP_X86_64)
#define GEMMLOWP_X86
#endif

// Some of our optimized paths use inline assembly; for now, we don't bother
// enabling equivalent optimized paths using intrinsics where inline assembly
// is unavailable.
#ifdef GEMMLOWP_ALLOW_INLINE_ASM

// Detect NEON. It's important to check for both tokens.
#if (defined __ARM_NEON) || (defined __ARM_NEON__)
#define GEMMLOWP_NEON
#endif

// Convenience NEON tokens for 32-bit or 64-bit.
#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_32)
#define GEMMLOWP_NEON_32
#endif

#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_NEON_64
#endif

// Detect MIPS MSA.
// Limit MSA optimizations to little-endian CPUs for now.
// TODO: Perhaps eventually support MSA optimizations on big-endian CPUs?
#if defined(GEMMLOWP_MIPS) && (__mips_isa_rev >= 5) && defined(__mips_msa) && \
    defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define GEMMLOWP_MSA
#endif

// Convenience MIPS MSA tokens for 32-bit or 64-bit.
#if defined(GEMMLOWP_MSA) && defined(GEMMLOWP_MIPS_32)
#define GEMMLOWP_MSA_32
#endif

#if defined(GEMMLOWP_MSA) && defined(GEMMLOWP_MIPS_64)
#define GEMMLOWP_MSA_64
#endif

// Detect SSE.
#ifdef __SSE4_1__
#define GEMMLOWP_SSE4
#endif

#ifdef __SSE3__
#define GEMMLOWP_SSE3
#endif

// Convenience SSE4 tokens for 32-bit or 64-bit.
#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_32) && \
    !defined(GEMMLOWP_DISABLE_SSE4)
#define GEMMLOWP_SSE4_32
#endif

#if defined(GEMMLOWP_SSE3) && defined(GEMMLOWP_X86_32)
#define GEMMLOWP_SSE3_32
#endif

#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_64) && \
    !defined(GEMMLOWP_DISABLE_SSE4)
#define GEMMLOWP_SSE4_64
#endif

#if defined(GEMMLOWP_SSE3) && defined(GEMMLOWP_X86_64)
#define GEMMLOWP_SSE3_64
#endif

#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __msan_unpoison
#elif __has_feature(address_sanitizer)
#include <sanitizer/asan_interface.h>
#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __asan_unpoison_memory_region
#endif
#endif

#endif  // GEMMLOWP_ALLOW_INLINE_ASM

// Detect Android. Don't conflate with ARM - we care about tuning
// for non-ARM Android devices too. This can be used in conjunction
// with x86 to tune differently for mobile x86 CPUs (Atom) vs. desktop x86
// CPUs.
#if defined(__ANDROID__) || defined(ANDROID)
#define GEMMLOWP_ANDROID
#endif
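// For illustration only: the platform tokens defined above are consumed by
// compile-time dispatch elsewhere in gemmlowp, e.g. to select a default
// kernel. A hedged sketch of the pattern, with hypothetical kernel names
// standing in for the real ones:
//
//   #if defined(GEMMLOWP_NEON_64)
//   typedef SomeNeon64Kernel DefaultKernel;      // hypothetical name
//   #elif defined(GEMMLOWP_SSE4_64)
//   typedef SomeSse4Kernel DefaultKernel;        // hypothetical name
//   #else
//   typedef SomeReferenceKernel DefaultKernel;   // hypothetical name
//   #endif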
namespace gemmlowp {

// Standard cache line size. Useful to optimize alignment and
// prefetches. Ideally we would query this at runtime; however,
// 64-byte cache lines are the vast majority, and even if this value is
// wrong on some device, it will be wrong by no more than a 2x factor,
// which should be acceptable.
const int kDefaultCacheLineSize = 64;

// Default L1 and L2 data cache sizes.
// The L1 cache size is assumed to be for each core.
// The L2 cache size is assumed to be shared among all cores. What
// we call 'L2' here is effectively the top-level cache.
//
// On x86, we should ideally query this at
// runtime. On ARM, the instruction to query this is privileged and
// Android kernels do not expose it to userspace. Fortunately, the majority
// of ARM devices have roughly comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The following values are equal to or somewhat lower than that, and were
// found to perform well on both the Nexus 5 and Android One.
// Of course, these values are in principle too low for typical x86 CPUs,
// where we should set the L2 value to at least
// (L3 cache size / number of cores).
#if defined(GEMMLOWP_ARM) && defined(__APPLE__)
// iPhone/iPad
const int kDefaultL1CacheSize = 48 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// Other ARM or ARM-like hardware (Android implies ARM-like), so here it's OK
// to tune for ARM, although on x86 Atom we might be able to query
// cache sizes at runtime, which would be better.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android. Therefore, likely desktop-class x86 hardware.
// Thus we assume larger cache sizes, though we really should query
// them at runtime.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android. Same as x86-64 but less bullish.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_MIPS)
// MIPS and not Android. TODO: MIPS and Android?
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 1024 * 1024;
#else
// Less common hardware. Maybe some unusual or older or embedded thing.
// Assume smaller caches, but don't depart too far from what we do
// on ARM/Android to avoid accidentally exposing unexpected behavior.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif
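// For illustration only (not part of gemmlowp): as the comments above note,
// these sizes would ideally be queried at runtime. On Linux/glibc, a sketch
// of such a query could look like the following; _SC_LEVEL1_DCACHE_SIZE is a
// glibc extension that may be absent, or for which sysconf may return 0 or
// -1, on other systems:
//
//   #include <unistd.h>
//   inline int RuntimeL1CacheSize() {  // hypothetical helper
//     const long queried = sysconf(_SC_LEVEL1_DCACHE_SIZE);
//     return queried > 0 ? static_cast<int>(queried) : kDefaultL1CacheSize;
//   }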
// The proportion of the cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and is typically closer to 1,
// as we typically want to use most of the L2 cache for storing a large
// RHS block.
#if defined(GEMMLOWP_X86)
// For IA (Intel Architecture), use the entire L2 cache for the RHS matrix.
// The LHS matrix is not blocked for L2 cache.
const float kDefaultL2RhsFactor = 1.00f;
#else
const float kDefaultL2RhsFactor = 0.75f;
#endif

// The number of bytes in a SIMD register. This is used to determine
// the dimensions of PackingRegisterBlock so that such blocks can
// be efficiently loaded into registers, so that packing code can
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values, but must ensure that their own optimized packing
// paths are consistent with this value.
const int kRegisterSize = 16;

// Hints the CPU to prefetch the cache line containing ptr.
inline void Prefetch(const void* ptr) {
#if defined GEMMLOWP_ARM_64 && defined GEMMLOWP_ALLOW_INLINE_ASM
  // AArch64 has very detailed prefetch instructions that compilers
  // can't know how to map __builtin_prefetch to, and as a result don't,
  // leaving __builtin_prefetch a no-op on this architecture.
  // For our purposes, "pldl1keep" is usually what we want, meaning:
  // "prefetch for load, into L1 cache, using each value multiple times".
  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
#elif defined __GNUC__
  // Clang and GCC define __GNUC__ and have __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  (void)ptr;
#endif
}

// Returns the runtime argument rounded down to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  return i - (i % Modulus);
}

// Returns the runtime argument rounded up to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundUp(Integer i) {
  return RoundDown<Modulus>(i + Modulus - 1);
}

// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  return (a + b - 1) / b;
}

// Returns the argument rounded up to the nearest power of two.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  Integer i = n - 1;
  i |= i >> 1;
  i |= i >> 2;
  i |= i >> 4;
  i |= i >> 8;
  i |= i >> 16;
  return i + 1;
}

template <int N>
struct IsPowerOfTwo {
  static const bool value = !(N & (N - 1));
};
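// Worked examples for the helpers above:
//   RoundDown<8>(13) == 8,   RoundDown<8>(16) == 16
//   RoundUp<8>(13)   == 16,  RoundUp<8>(16)   == 16
//   CeilQuotient(13, 8) == 2
//   RoundUpToPowerOfTwo(5) == 8: the shift-or cascade smears the highest set
//   bit of (n - 1) into all lower bits, so adding 1 yields the next power of
//   two. Note that the cascade stops at >> 16, so this is only exact for
//   values fitting in 32 bits.
//   IsPowerOfTwo<16>::value == true, IsPowerOfTwo<24>::value == false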
template <typename T>
void MarkMemoryAsInitialized(T* ptr, int size) {
#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED
  GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(static_cast<void*>(ptr),
                                      size * sizeof(T));
#else
  (void)ptr;
  (void)size;
#endif
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_COMMON_H_