/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// SSE4.2 accelerated CRC32c.

// See if the SSE4.2 crc32c instruction is available.
#undef USE_SSE_CRC32C
#ifdef __SSE4_2__
#if defined(__x86_64__) && defined(__GNUC__) && \
    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
#define USE_SSE_CRC32C 1
#elif defined(__x86_64__) && defined(__clang__)
#if __has_builtin(__builtin_cpu_supports)
#define USE_SSE_CRC32C 1
#endif
#endif
#endif /* __SSE4_2__ */

// This version of Apple clang has a bug:
// https://llvm.org/bugs/show_bug.cgi?id=25510
#if defined(__APPLE__) && (__clang_major__ <= 8)
#undef USE_SSE_CRC32C
#endif

#ifdef USE_SSE_CRC32C
#include <nmmintrin.h>
#endif

namespace tensorflow {
namespace crc32c {

#ifndef USE_SSE_CRC32C

// Fallback used when this file is compiled without SSE4.2 support.
// Always reports that no accelerated implementation is available.
bool CanAccelerate() { return false; }

// Placeholder so the symbol exists in builds without SSE4.2 support.
// Should not be called: it ignores all of its arguments and always returns 0,
// which is NOT a CRC of the input.
uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size) {
  (void)crc;
  (void)buf;
  (void)size;
  return 0;
}

#else

namespace {

// Copies the 8 bytes starting at `p` into a uint64_t. Going through memcpy
// rather than dereferencing a casted pointer avoids reading the byte buffer
// through an lvalue of an unrelated type.
inline uint64_t Load64(const uint8_t *p) {
  uint64_t v;
  memcpy(&v, p, sizeof(v));
  return v;
}

}  // namespace

// SSE4.2 optimized crc32c computation.

// Returns whether the CPU we are running on reports support for "sse4.2".
bool CanAccelerate() { return __builtin_cpu_supports("sse4.2"); }

// Extends the CRC value `crc` with the `size` bytes starting at `buf`, using
// the SSE4.2 crc32 instructions, and returns the extended value.
//
// The incoming value is inverted (xor with 0xffffffff) before processing and
// the result is inverted again before returning, so a value returned by this
// function can be passed back in as `crc` to continue over further data.
uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size) {
  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
  const uint8_t *e = p + size;
  uint32_t l = crc ^ 0xffffffffu;

  // Number of bytes between p and the next 8-byte aligned address (0 if p is
  // already aligned). Computed on the integer value of the pointer so that we
  // never form a pointer beyond the end of the buffer.
  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
  const size_t prefix = static_cast<size_t>((8 - (pval & 7)) & 7);
  if (prefix <= size) {
    // Process bytes one at a time until p is 8-byte aligned.
    for (size_t i = 0; i < prefix; ++i) {
      l = _mm_crc32_u8(l, *p);
      ++p;
    }
  }
  // Otherwise the aligned address lies past the end of the input; fewer than
  // 8 bytes remain and they are all handled by the byte-wise loop below.

  // Process bytes 16 at a time (two 8-byte steps per iteration).
  uint64_t l64 = l;
  while ((e - p) >= 16) {
    l64 = _mm_crc32_u64(l64, Load64(p));
    l64 = _mm_crc32_u64(l64, Load64(p + 8));
    p += 16;
  }

  // Process remaining bytes one at a time.
  l = static_cast<uint32_t>(l64);
  while (p < e) {
    l = _mm_crc32_u8(l, *p);
    ++p;
  }

  return l ^ 0xffffffffu;
}

#endif

}  // namespace crc32c
}  // namespace tensorflow