Home | History | Annotate | Download | only in hash
      1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
#include <stddef.h>
#include <stdint.h>
#include <string.h>
     18 
     19 // SSE4.2 accelerated CRC32c.
     20 
     21 // See if the SSE4.2 crc32c instruction is available.
     22 #undef USE_SSE_CRC32C
     23 #ifdef __SSE4_2__
     24 #if defined(__x86_64__) && defined(__GNUC__) && \
     25     (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
     26 #define USE_SSE_CRC32C 1
     27 #elif defined(__x86_64__) && defined(__clang__)
     28 #if __has_builtin(__builtin_cpu_supports)
     29 #define USE_SSE_CRC32C 1
     30 #endif
     31 #endif
     32 #endif /* __SSE4_2__ */
     33 
// Disable the accelerated path on Apple clang with major version <= 8,
// which has a bug:
// https://llvm.org/bugs/show_bug.cgi?id=25510
     36 #if defined(__APPLE__) && (__clang_major__ <= 8)
     37 #undef USE_SSE_CRC32C
     38 #endif
     39 
     40 #ifdef USE_SSE_CRC32C
     41 #include <nmmintrin.h>
     42 #endif
     43 
     44 namespace tensorflow {
     45 namespace crc32c {
     46 
     47 #ifndef USE_SSE_CRC32C
     48 
     49 bool CanAccelerate() { return false; }
     50 uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size) {
     51   // Should not be called.
     52   return 0;
     53 }
     54 
     55 #else
     56 
     57 // SSE4.2 optimized crc32c computation.
     58 bool CanAccelerate() { return __builtin_cpu_supports("sse4.2"); }
     59 
     60 uint32_t AcceleratedExtend(uint32_t crc, const char *buf, size_t size) {
     61   const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
     62   const uint8_t *e = p + size;
     63   uint32_t l = crc ^ 0xffffffffu;
     64 
     65   // Advance p until aligned to 8-bytes..
     66   // Point x at first 7-byte aligned byte in string.  This might be
     67   // just past the end of the string.
     68   const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
     69   const uint8_t *x = reinterpret_cast<const uint8_t *>(((pval + 7) >> 3) << 3);
     70   if (x <= e) {
     71     // Process bytes until finished or p is 8-byte aligned
     72     while (p != x) {
     73       l = _mm_crc32_u8(l, *p);
     74       p++;
     75     }
     76   }
     77 
     78   // Process bytes 16 at a time
     79   uint64_t l64 = l;
     80   while ((e - p) >= 16) {
     81     l64 = _mm_crc32_u64(l64, *reinterpret_cast<const uint64_t *>(p));
     82     l64 = _mm_crc32_u64(l64, *reinterpret_cast<const uint64_t *>(p + 8));
     83     p += 16;
     84   }
     85 
     86   // Process remaining bytes one at a time.
     87   l = l64;
     88   while (p < e) {
     89     l = _mm_crc32_u8(l, *p);
     90     p++;
     91   }
     92 
     93   return l ^ 0xffffffffu;
     94 }
     95 
     96 #endif
     97 
     98 }  // namespace crc32c
     99 }  // namespace tensorflow
    100