1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/core/lib/strings/ordered_code.h" 17 18 #include <assert.h> 19 #include <stddef.h> 20 21 #include "tensorflow/core/lib/core/stringpiece.h" 22 #include "tensorflow/core/platform/logging.h" 23 24 namespace tensorflow { 25 namespace strings { 26 27 // We encode a string in different ways depending on whether the item 28 // should be in lexicographically increasing or decreasing order. 29 // 30 // 31 // Lexicographically increasing order 32 // 33 // We want a string-to-string mapping F(x) such that for any two strings 34 // 35 // x < y => F(x) < F(y) 36 // 37 // In addition to the normal characters '\x00' through '\xff', we want to 38 // encode a few extra symbols in strings: 39 // 40 // <sep> Separator between items 41 // <infinity> Infinite string 42 // 43 // Therefore we need an alphabet with at least 258 symbols. Each 44 // character '\1' through '\xfe' is mapped to itself. The other four are 45 // encoded into two-letter sequences starting with '\0' and '\xff': 46 // 47 // <sep> encoded as => \0\1 48 // \0 encoded as => \0\xff 49 // \xff encoded as => \xff\x00 50 // <infinity> encoded as => \xff\xff 51 // 52 // The remaining two-letter sequences starting with '\0' and '\xff' are 53 // currently unused. 54 // 55 // F(<infinity>) is defined above. For any finite string x, F(x) is the 56 // the encodings of x's characters followed by the encoding for <sep>. The 57 // ordering of two finite strings is the same as the ordering of the 58 // respective characters at the first position where they differ, which in 59 // turn is the same as the ordering of the encodings of those two 60 // characters. Moreover, for every finite string x, F(x) < F(<infinity>). 61 // 62 // 63 // Lexicographically decreasing order 64 // 65 // We want a string-to-string mapping G(x) such that for any two strings, 66 // whether finite or not, 67 // 68 // x < y => G(x) > G(y) 69 // 70 // To achieve this, define G(x) to be the inversion of F(x): I(F(x)). In 71 // other words, invert every bit in F(x) to get G(x). For example, 72 // 73 // x = \x00\x13\xff 74 // F(x) = \x00\xff\x13\xff\x00\x00\x01 escape \0, \xff, append F(<sep>) 75 // G(x) = \xff\x00\xec\x00\xff\xff\xfe invert every bit in F(x) 76 // 77 // x = <infinity> 78 // F(x) = \xff\xff 79 // G(x) = \x00\x00 80 // 81 // Another example is 82 // 83 // x F(x) G(x) = I(F(x)) 84 // - ---- -------------- 85 // <infinity> \xff\xff \x00\x00 86 // "foo" foo\0\1 \x99\x90\x90\xff\xfe 87 // "aaa" aaa\0\1 \x9e\x9e\x9e\xff\xfe 88 // "aa" aa\0\1 \x9e\x9e\xff\xfe 89 // "" \0\1 \xff\xfe 90 // 91 // More generally and rigorously, if for any two strings x and y 92 // 93 // F(x) < F(y) => I(F(x)) > I(F(y)) (1) 94 // 95 // it would follow that x < y => G(x) > G(y) because 96 // 97 // x < y => F(x) < F(y) => G(x) = I(F(x)) > I(F(y)) = G(y) 98 // 99 // We now show why (1) is true, in two parts. Notice that for any two 100 // strings x < y, F(x) is *not* a proper prefix of F(y). Suppose x is a 101 // proper prefix of y (say, x="abc" < y="abcd"). F(x) and F(y) diverge at 102 // the F(<sep>) in F(x) (v. F('d') in the example). Suppose x is not a 103 // proper prefix of y (say, x="abce" < y="abd"), F(x) and F(y) diverge at 104 // their respective encodings of the characters where x and y diverge 105 // (F('c') v. F('d')). Finally, if y=<infinity>, we can see that 106 // F(y)=\xff\xff is not the prefix of F(x) for any finite string x, simply 107 // by considering all the possible first characters of F(x). 108 // 109 // Given that F(x) is not a proper prefix F(y), the order of F(x) and F(y) 110 // is determined by the byte where F(x) and F(y) diverge. For example, the 111 // order of F(x)="eefh" and F(y)="eeg" is determined by their third 112 // characters. I(p) inverts each byte in p, which effectively subtracts 113 // each byte from 0xff. So, in this example, I('f') > I('g'), and thus 114 // I(F(x)) > I(F(y)). 115 // 116 // 117 // Implementation 118 // 119 // To implement G(x) efficiently, we use C++ template to instantiate two 120 // versions of the code to produce F(x), one for normal encoding (giving us 121 // F(x)) and one for inverted encoding (giving us G(x) = I(F(x))). 122 123 static const char kEscape1 = '\000'; 124 static const char kNullCharacter = '\xff'; // Combined with kEscape1 125 static const char kSeparator = '\001'; // Combined with kEscape1 126 127 static const char kEscape2 = '\xff'; 128 static const char kFFCharacter = '\000'; // Combined with kEscape2 129 130 static const char kEscape1_Separator[2] = {kEscape1, kSeparator}; 131 132 // Append to "*dest" the "len" bytes starting from "*src". 133 inline static void AppendBytes(string* dest, const char* src, size_t len) { 134 dest->append(src, len); 135 } 136 137 inline bool IsSpecialByte(char c) { 138 return (static_cast<unsigned char>(c + 1)) < 2; 139 } 140 141 // Return a pointer to the first byte in the range "[start..limit)" 142 // whose value is 0 or 255 (kEscape1 or kEscape2). If no such byte 143 // exists in the range, returns "limit". 144 inline const char* SkipToNextSpecialByte(const char* start, const char* limit) { 145 // If these constants were ever changed, this routine needs to change 146 DCHECK_EQ(kEscape1, 0); 147 DCHECK_EQ(kEscape2 & 0xffu, 255u); 148 const char* p = start; 149 while (p < limit && !IsSpecialByte(*p)) { 150 p++; 151 } 152 return p; 153 } 154 155 // Expose SkipToNextSpecialByte for testing purposes 156 const char* OrderedCode::TEST_SkipToNextSpecialByte(const char* start, 157 const char* limit) { 158 return SkipToNextSpecialByte(start, limit); 159 } 160 161 // Helper routine to encode "s" and append to "*dest", escaping special 162 // characters. 163 inline static void EncodeStringFragment(string* dest, StringPiece s) { 164 const char* p = s.data(); 165 const char* limit = p + s.size(); 166 const char* copy_start = p; 167 while (true) { 168 p = SkipToNextSpecialByte(p, limit); 169 if (p >= limit) break; // No more special characters that need escaping 170 char c = *(p++); 171 DCHECK(IsSpecialByte(c)); 172 if (c == kEscape1) { 173 AppendBytes(dest, copy_start, p - copy_start - 1); 174 dest->push_back(kEscape1); 175 dest->push_back(kNullCharacter); 176 copy_start = p; 177 } else { 178 assert(c == kEscape2); 179 AppendBytes(dest, copy_start, p - copy_start - 1); 180 dest->push_back(kEscape2); 181 dest->push_back(kFFCharacter); 182 copy_start = p; 183 } 184 } 185 if (p > copy_start) { 186 AppendBytes(dest, copy_start, p - copy_start); 187 } 188 } 189 190 void OrderedCode::WriteString(string* dest, StringPiece s) { 191 EncodeStringFragment(dest, s); 192 AppendBytes(dest, kEscape1_Separator, 2); 193 } 194 195 void OrderedCode::WriteNumIncreasing(string* dest, uint64 val) { 196 // Values are encoded with a single byte length prefix, followed 197 // by the actual value in big-endian format with leading 0 bytes 198 // dropped. 199 unsigned char buf[9]; // 8 bytes for value plus one byte for length 200 int len = 0; 201 while (val > 0) { 202 len++; 203 buf[9 - len] = (val & 0xff); 204 val >>= 8; 205 } 206 buf[9 - len - 1] = len; 207 len++; 208 AppendBytes(dest, reinterpret_cast<const char*>(buf + 9 - len), len); 209 } 210 211 // Parse the encoding of a previously encoded string. 212 // If parse succeeds, return true, consume encoding from 213 // "*src", and if result != NULL append the decoded string to "*result". 214 // Otherwise, return false and leave both undefined. 215 inline static bool ReadStringInternal(StringPiece* src, string* result) { 216 const char* start = src->data(); 217 const char* string_limit = src->data() + src->size(); 218 219 // We only scan up to "limit-2" since a valid string must end with 220 // a two character terminator: 'kEscape1 kSeparator' 221 const char* limit = string_limit - 1; 222 const char* copy_start = start; 223 while (true) { 224 start = SkipToNextSpecialByte(start, limit); 225 if (start >= limit) break; // No terminator sequence found 226 const char c = *(start++); 227 // If inversion is required, instead of inverting 'c', we invert the 228 // character constants to which 'c' is compared. We get the same 229 // behavior but save the runtime cost of inverting 'c'. 230 DCHECK(IsSpecialByte(c)); 231 if (c == kEscape1) { 232 if (result) { 233 AppendBytes(result, copy_start, start - copy_start - 1); 234 } 235 // kEscape1 kSeparator ends component 236 // kEscape1 kNullCharacter represents '\0' 237 const char next = *(start++); 238 if (next == kSeparator) { 239 src->remove_prefix(start - src->data()); 240 return true; 241 } else if (next == kNullCharacter) { 242 if (result) { 243 *result += '\0'; 244 } 245 } else { 246 return false; 247 } 248 copy_start = start; 249 } else { 250 assert(c == kEscape2); 251 if (result) { 252 AppendBytes(result, copy_start, start - copy_start - 1); 253 } 254 // kEscape2 kFFCharacter represents '\xff' 255 // kEscape2 kInfinity is an error 256 const char next = *(start++); 257 if (next == kFFCharacter) { 258 if (result) { 259 *result += '\xff'; 260 } 261 } else { 262 return false; 263 } 264 copy_start = start; 265 } 266 } 267 return false; 268 } 269 270 bool OrderedCode::ReadString(StringPiece* src, string* result) { 271 return ReadStringInternal(src, result); 272 } 273 274 bool OrderedCode::ReadNumIncreasing(StringPiece* src, uint64* result) { 275 if (src->empty()) { 276 return false; // Not enough bytes 277 } 278 279 // Decode length byte 280 const size_t len = static_cast<unsigned char>((*src)[0]); 281 282 // If len > 0 and src is longer than 1, the first byte of "payload" 283 // must be non-zero (otherwise the encoding is not minimal). 284 // In opt mode, we don't enforce that encodings must be minimal. 285 DCHECK(0 == len || src->size() == 1 || (*src)[1] != '\0') 286 << "invalid encoding"; 287 288 if (len + 1 > src->size() || len > 8) { 289 return false; // Not enough bytes or too many bytes 290 } 291 292 if (result) { 293 uint64 tmp = 0; 294 for (size_t i = 0; i < len; i++) { 295 tmp <<= 8; 296 tmp |= static_cast<unsigned char>((*src)[1 + i]); 297 } 298 *result = tmp; 299 } 300 src->remove_prefix(len + 1); 301 return true; 302 } 303 304 void OrderedCode::TEST_Corrupt(string* str, int k) { 305 int seen_seps = 0; 306 for (size_t i = 0; i + 1 < str->size(); i++) { 307 if ((*str)[i] == kEscape1 && (*str)[i + 1] == kSeparator) { 308 seen_seps++; 309 if (seen_seps == k) { 310 (*str)[i + 1] = kSeparator + 1; 311 return; 312 } 313 } 314 } 315 } 316 317 // Signed number encoding/decoding ///////////////////////////////////// 318 // 319 // The format is as follows: 320 // 321 // The first bit (the most significant bit of the first byte) 322 // represents the sign, 0 if the number is negative and 323 // 1 if the number is >= 0. 324 // 325 // Any unbroken sequence of successive bits with the same value as the sign 326 // bit, up to 9 (the 8th and 9th are the most significant bits of the next 327 // byte), are size bits that count the number of bytes after the first byte. 328 // That is, the total length is between 1 and 10 bytes. 329 // 330 // The value occupies the bits after the sign bit and the "size bits" 331 // till the end of the string, in network byte order. If the number 332 // is negative, the bits are in 2-complement. 333 // 334 // 335 // Example 1: number 0x424242 -> 4 byte big-endian hex string 0xf0424242: 336 // 337 // +---------------+---------------+---------------+---------------+ 338 // 1 1 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 339 // +---------------+---------------+---------------+---------------+ 340 // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 341 // | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 342 // | | | | payload: the remaining bits after the sign and size bits 343 // | | | | and the delimiter bit, the value is 0x424242 344 // | | | | 345 // | size bits: 3 successive bits with the same value as the sign bit 346 // | (followed by a delimiter bit with the opposite value) 347 // | mean that there are 3 bytes after the first byte, 4 total 348 // | 349 // sign bit: 1 means that the number is non-negative 350 // 351 // Example 2: negative number -0x800 -> 2 byte big-endian hex string 0x3800: 352 // 353 // +---------------+---------------+ 354 // 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 355 // +---------------+---------------+ 356 // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ 357 // | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 358 // | | payload: the remaining bits after the sign and size bits and the 359 // | | delimiter bit, 2-complement because of the negative sign, 360 // | | value is ~0x7ff, represents the value -0x800 361 // | | 362 // | size bits: 1 bit with the same value as the sign bit 363 // | (followed by a delimiter bit with the opposite value) 364 // | means that there is 1 byte after the first byte, 2 total 365 // | 366 // sign bit: 0 means that the number is negative 367 // 368 // 369 // Compared with the simpler unsigned format used for uint64 numbers, 370 // this format is more compact for small numbers, namely one byte encodes 371 // numbers in the range [-64,64), two bytes cover the range [-2^13,2^13), etc. 372 // In general, n bytes encode numbers in the range [-2^(n*7-1),2^(n*7-1)). 373 // (The cross-over point for compactness of representation is 8 bytes, 374 // where this format only covers the range [-2^55,2^55), 375 // whereas an encoding with sign bit and length in the first byte and 376 // payload in all following bytes would cover [-2^56,2^56).) 377 378 static const int kMaxSigned64Length = 10; 379 380 // This array maps encoding length to header bits in the first two bytes. 381 static const char kLengthToHeaderBits[1 + kMaxSigned64Length][2] = { 382 {0, 0}, {'\x80', 0}, {'\xc0', 0}, {'\xe0', 0}, 383 {'\xf0', 0}, {'\xf8', 0}, {'\xfc', 0}, {'\xfe', 0}, 384 {'\xff', 0}, {'\xff', '\x80'}, {'\xff', '\xc0'}}; 385 386 // This array maps encoding lengths to the header bits that overlap with 387 // the payload and need fixing when reading. 388 static const uint64 kLengthToMask[1 + kMaxSigned64Length] = { 389 0ULL, 390 0x80ULL, 391 0xc000ULL, 392 0xe00000ULL, 393 0xf0000000ULL, 394 0xf800000000ULL, 395 0xfc0000000000ULL, 396 0xfe000000000000ULL, 397 0xff00000000000000ULL, 398 0x8000000000000000ULL, 399 0ULL}; 400 401 // This array maps the number of bits in a number to the encoding 402 // length produced by WriteSignedNumIncreasing. 403 // For positive numbers, the number of bits is 1 plus the most significant 404 // bit position (the highest bit position in a positive int64 is 63). 405 // For a negative number n, we count the bits in ~n. 406 // That is, length = kBitsToLength[Bits::Log2Floor64(n < 0 ? ~n : n) + 1]. 407 static const int8 kBitsToLength[1 + 63] = { 408 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 409 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 410 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10}; 411 412 #if defined(__GNUC__) 413 // Returns floor(lg(n)). Returns -1 if n == 0. 414 static int Log2Floor64(uint64 n) { 415 return n == 0 ? -1 : 63 ^ __builtin_clzll(n); 416 } 417 #else 418 // Portable slow version 419 static int Log2Floor32_Portable(uint32 n) { 420 if (n == 0) return -1; 421 int log = 0; 422 uint32 value = n; 423 for (int i = 4; i >= 0; --i) { 424 int shift = (1 << i); 425 uint32 x = value >> shift; 426 if (x != 0) { 427 value = x; 428 log += shift; 429 } 430 } 431 assert(value == 1); 432 return log; 433 } 434 // Returns floor(lg(n)). Returns -1 if n == 0. 435 static int Log2Floor64(uint64 n) { 436 const uint32 topbits = static_cast<uint32>(n >> 32); 437 if (topbits == 0) { 438 // Top bits are zero, so scan in bottom bits 439 return Log2Floor32_Portable(static_cast<uint32>(n)); 440 } else { 441 return 32 + Log2Floor32_Portable(topbits); 442 } 443 } 444 #endif 445 446 // Calculates the encoding length in bytes of the signed number n. 447 static inline int SignedEncodingLength(int64 n) { 448 return kBitsToLength[Log2Floor64(n < 0 ? ~n : n) + 1]; 449 } 450 451 static void StoreBigEndian64(char* dst, uint64 v) { 452 for (int i = 0; i < 8; i++) { 453 dst[i] = (v >> (56 - 8 * i)) & 0xff; 454 } 455 } 456 457 static uint64 LoadBigEndian64(const char* src) { 458 uint64 result = 0; 459 for (int i = 0; i < 8; i++) { 460 unsigned char c = static_cast<unsigned char>(src[i]); 461 result |= static_cast<uint64>(c) << (56 - 8 * i); 462 } 463 return result; 464 } 465 466 void OrderedCode::WriteSignedNumIncreasing(string* dest, int64 val) { 467 const uint64 x = val < 0 ? ~val : val; 468 if (x < 64) { // fast path for encoding length == 1 469 *dest += kLengthToHeaderBits[1][0] ^ val; 470 return; 471 } 472 // buf = val in network byte order, sign extended to 10 bytes 473 const char sign_byte = val < 0 ? '\xff' : '\0'; 474 char buf[10] = { 475 sign_byte, 476 sign_byte, 477 }; 478 StoreBigEndian64(buf + 2, val); 479 static_assert(sizeof(buf) == kMaxSigned64Length, "max length size mismatch"); 480 const int len = SignedEncodingLength(x); 481 DCHECK_GE(len, 2); 482 char* const begin = buf + sizeof(buf) - len; 483 begin[0] ^= kLengthToHeaderBits[len][0]; 484 begin[1] ^= kLengthToHeaderBits[len][1]; // ok because len >= 2 485 dest->append(begin, len); 486 } 487 488 bool OrderedCode::ReadSignedNumIncreasing(StringPiece* src, int64* result) { 489 if (src->empty()) return false; 490 const uint64 xor_mask = (!((*src)[0] & 0x80)) ? ~0ULL : 0ULL; 491 const unsigned char first_byte = (*src)[0] ^ (xor_mask & 0xff); 492 493 // now calculate and test length, and set x to raw (unmasked) result 494 int len; 495 uint64 x; 496 if (first_byte != 0xff) { 497 len = 7 - Log2Floor64(first_byte ^ 0xff); 498 if (src->size() < static_cast<size_t>(len)) return false; 499 x = xor_mask; // sign extend using xor_mask 500 for (int i = 0; i < len; ++i) 501 x = (x << 8) | static_cast<unsigned char>((*src)[i]); 502 } else { 503 len = 8; 504 if (src->size() < static_cast<size_t>(len)) return false; 505 const unsigned char second_byte = (*src)[1] ^ (xor_mask & 0xff); 506 if (second_byte >= 0x80) { 507 if (second_byte < 0xc0) { 508 len = 9; 509 } else { 510 const unsigned char third_byte = (*src)[2] ^ (xor_mask & 0xff); 511 if (second_byte == 0xc0 && third_byte < 0x80) { 512 len = 10; 513 } else { 514 return false; // either len > 10 or len == 10 and #bits > 63 515 } 516 } 517 if (src->size() < static_cast<size_t>(len)) return false; 518 } 519 x = LoadBigEndian64(src->data() + len - 8); 520 } 521 522 x ^= kLengthToMask[len]; // remove spurious header bits 523 524 DCHECK_EQ(len, SignedEncodingLength(x)) << "invalid encoding"; 525 526 if (result) *result = x; 527 src->remove_prefix(len); 528 return true; 529 } 530 531 } // namespace strings 532 } // namespace tensorflow 533