// Fast memory copying and comparison routines.
//   strings::fastmemcmp_inlined() replaces memcmp()
//   strings::memcpy_inlined() replaces memcpy()
//   strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0
//
// strings::*_inlined() routines are inline versions of the routines exported
// by this module.  Sometimes using the inlined versions is faster.  Measure
// before using the inlined versions.

#ifndef DYNAMIC_DEPTH_INTERNAL_STRINGS_FASTMEM_H_  // NOLINT
#define DYNAMIC_DEPTH_INTERNAL_STRINGS_FASTMEM_H_  // NOLINT

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "base/integral_types.h"
#include "base/macros.h"
#include "base/port.h"

namespace dynamic_depth {
namespace strings {

// Returns true if the n bytes at a equal the n bytes at b.
// The regions are allowed to overlap.
//
// The performance is similar to the performance of memcmp(), but faster for
// moderately-sized inputs, or inputs that share a common prefix and differ
// somewhere in their last 8 bytes.  Further optimizations can be added later
// if it makes sense to do so.  Alternatively, if the compiler & runtime
// improve to eliminate the need for this, we can remove it.  Please keep this
// in sync with google_internal::gg_memeq() in //third_party/stl/gcc3/string.
inline bool memeq(const char* a, const char* b, size_t n) {
  size_t n_rounded_down = n & ~static_cast<size_t>(7);
  if (PREDICT_FALSE(n_rounded_down == 0)) {  // n <= 7
    return memcmp(a, b, n) == 0;
  }
  // n >= 8
  uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
  uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8);
  if ((u | v) != 0) {  // The first or last 8 bytes differ.
    return false;
  }
  // The next line forces n to be a multiple of 8.
  n = n_rounded_down;
  if (n >= 80) {
    // memcmp() is already fast on long strings with modern (2013 or later)
    // compilers and runtimes.
    return memcmp(a, b, n) == 0;
  }
  // Now force n to be a multiple of 16.  Arguably, a "switch" would be smart
  // here, but there's a difficult-to-evaluate code size vs. speed issue.  The
  // current approach often re-compares some bytes (the worst case is when n
  // was initially 16, 32, 48, or 64), but is fairly short.
  size_t e = n & 8;
  a += e;
  b += e;
  n -= e;
  // n is now in {0, 16, 32, ...}.  Process 0 or more 16-byte chunks.
  while (n > 0) {
    uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
    uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8);
    if ((x | y) != 0) {
      return false;
    }
    a += 16;
    b += 16;
    n -= 16;
  }
  return true;
}
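// A minimal usage sketch (the buffers below are illustrative, not part of
// this header).  memeq() is a drop-in replacement for memcmp(a, b, n) == 0:
//
//   const char a[] = "0123456789abcdef";
//   const char b[] = "0123456789abcdeX";
//   bool eq15 = strings::memeq(a, b, 15);  // true: first 15 bytes match
//   bool eq16 = strings::memeq(a, b, 16);  // false: the tail-8-byte load
//                                          // catches the differing last byte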
inline int fastmemcmp_inlined(const void* va, const void* vb, size_t n) {
  const unsigned char* pa = static_cast<const unsigned char*>(va);
  const unsigned char* pb = static_cast<const unsigned char*>(vb);
  switch (n) {
    default:
      return memcmp(va, vb, n);
    case 7:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      ++pa;
      ++pb;
      FALLTHROUGH_INTENDED;
    case 6:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      ++pa;
      ++pb;
      FALLTHROUGH_INTENDED;
    case 5:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      ++pa;
      ++pb;
      FALLTHROUGH_INTENDED;
    case 4:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      ++pa;
      ++pb;
      FALLTHROUGH_INTENDED;
    case 3:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      ++pa;
      ++pb;
      FALLTHROUGH_INTENDED;
    case 2:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      ++pa;
      ++pb;
      FALLTHROUGH_INTENDED;
    case 1:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      FALLTHROUGH_INTENDED;
    case 0:
      break;
  }
  return 0;
}

// The standard memcpy() operation is slow for variable small sizes.
// This implementation inlines the optimal realization for sizes 1 to 16.
// To avoid code bloat, use it only in performance-critical spots where
// sizes <= 16 are expected to be very frequent.
inline void memcpy_inlined(char* dst, const char* src, size_t size) {
  // The compiler inlines code with a minimal amount of data movement when
  // the third parameter of memcpy() is a compile-time constant.
  switch (size) {
    case 1:
      memcpy(dst, src, 1);
      break;
    case 2:
      memcpy(dst, src, 2);
      break;
    case 3:
      memcpy(dst, src, 3);
      break;
    case 4:
      memcpy(dst, src, 4);
      break;
    case 5:
      memcpy(dst, src, 5);
      break;
    case 6:
      memcpy(dst, src, 6);
      break;
    case 7:
      memcpy(dst, src, 7);
      break;
    case 8:
      memcpy(dst, src, 8);
      break;
    case 9:
      memcpy(dst, src, 9);
      break;
    case 10:
      memcpy(dst, src, 10);
      break;
    case 11:
      memcpy(dst, src, 11);
      break;
    case 12:
      memcpy(dst, src, 12);
      break;
    case 13:
      memcpy(dst, src, 13);
      break;
    case 14:
      memcpy(dst, src, 14);
      break;
    case 15:
      memcpy(dst, src, 15);
      break;
    case 16:
      memcpy(dst, src, 16);
      break;
    default:
      memcpy(dst, src, size);
      break;
  }
}

}  // namespace strings
}  // namespace dynamic_depth

#endif  // DYNAMIC_DEPTH_INTERNAL_STRINGS_FASTMEM_H_  // NOLINT
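// A hypothetical usage sketch (AppendField and its parameters are
// illustrative, not part of this header).  memcpy_inlined() pays off when
// `size` is a run-time value that is usually <= 16, because its switch
// dispatches each small size to a constant-size memcpy() that the compiler
// can fully inline:
//
//   void AppendField(char* dst, const char* src, size_t len) {
//     strings::memcpy_inlined(dst, src, len);  // fast path for len <= 16
//   }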