1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "sync/internal_api/public/base/unique_position.h" 6 7 #include "base/basictypes.h" 8 #include "base/logging.h" 9 #include "base/rand_util.h" 10 #include "base/stl_util.h" 11 #include "base/strings/string_number_conversions.h" 12 #include "sync/protocol/unique_position.pb.h" 13 #include "third_party/zlib/zlib.h" 14 15 namespace syncer { 16 17 const size_t UniquePosition::kSuffixLength = 28; 18 const size_t UniquePosition::kCompressBytesThreshold = 128; 19 20 // static. 21 bool UniquePosition::IsValidSuffix(const std::string& suffix) { 22 // The suffix must be exactly the specified length, otherwise unique suffixes 23 // are not sufficient to guarantee unique positions (because prefix + suffix 24 // == p + refixsuffix). 25 return suffix.length() == kSuffixLength 26 && suffix[kSuffixLength-1] != 0; 27 } 28 29 // static. 30 bool UniquePosition::IsValidBytes(const std::string& bytes) { 31 // The first condition ensures that our suffix uniqueness is sufficient to 32 // guarantee position uniqueness. Otherwise, it's possible the end of some 33 // prefix + some short suffix == some long suffix. 34 // The second condition ensures that FindSmallerWithSuffix can always return a 35 // result. 36 return bytes.length() >= kSuffixLength 37 && bytes[bytes.length()-1] != 0; 38 } 39 40 // static. 41 std::string UniquePosition::RandomSuffix() { 42 // Users random data for all but the last byte. The last byte must not be 43 // zero. We arbitrarily set it to 0x7f. 44 return base::RandBytesAsString(kSuffixLength - 1) + "\x7f"; 45 } 46 47 // static. 48 UniquePosition UniquePosition::CreateInvalid() { 49 UniquePosition pos; 50 DCHECK(!pos.IsValid()); 51 return pos; 52 } 53 54 // static. 55 UniquePosition UniquePosition::FromProto(const sync_pb::UniquePosition& proto) { 56 if (proto.has_custom_compressed_v1()) { 57 return UniquePosition(proto.custom_compressed_v1()); 58 } else if (proto.has_value() && !proto.value().empty()) { 59 return UniquePosition(Compress(proto.value())); 60 } else if (proto.has_compressed_value() && proto.has_uncompressed_length()) { 61 uLongf uncompressed_len = proto.uncompressed_length(); 62 std::string un_gzipped; 63 64 un_gzipped.resize(uncompressed_len); 65 int result = uncompress( 66 reinterpret_cast<Bytef*>(string_as_array(&un_gzipped)), 67 &uncompressed_len, 68 reinterpret_cast<const Bytef*>(proto.compressed_value().data()), 69 proto.compressed_value().size()); 70 if (result != Z_OK) { 71 DLOG(ERROR) << "Unzip failed " << result; 72 return UniquePosition::CreateInvalid(); 73 } 74 if (uncompressed_len != proto.uncompressed_length()) { 75 DLOG(ERROR) 76 << "Uncompressed length " << uncompressed_len 77 << " did not match specified length " << proto.uncompressed_length(); 78 return UniquePosition::CreateInvalid(); 79 } 80 return UniquePosition(Compress(un_gzipped)); 81 } else { 82 return UniquePosition::CreateInvalid(); 83 } 84 } 85 86 // static. 87 UniquePosition UniquePosition::FromInt64( 88 int64 x, const std::string& suffix) { 89 uint64 y = static_cast<uint64>(x); 90 y ^= 0x8000000000000000ULL; // Make it non-negative. 91 std::string bytes(8, 0); 92 for (int i = 7; i >= 0; --i) { 93 bytes[i] = static_cast<uint8>(y); 94 y >>= 8; 95 } 96 return UniquePosition(bytes + suffix, suffix); 97 } 98 99 // static. 100 UniquePosition UniquePosition::InitialPosition( 101 const std::string& suffix) { 102 DCHECK(IsValidSuffix(suffix)); 103 return UniquePosition(suffix, suffix); 104 } 105 106 // static. 107 UniquePosition UniquePosition::Before( 108 const UniquePosition& x, 109 const std::string& suffix) { 110 DCHECK(IsValidSuffix(suffix)); 111 DCHECK(x.IsValid()); 112 const std::string& before = FindSmallerWithSuffix( 113 Uncompress(x.compressed_), suffix); 114 return UniquePosition(before + suffix, suffix); 115 } 116 117 // static. 118 UniquePosition UniquePosition::After( 119 const UniquePosition& x, 120 const std::string& suffix) { 121 DCHECK(IsValidSuffix(suffix)); 122 DCHECK(x.IsValid()); 123 const std::string& after = FindGreaterWithSuffix( 124 Uncompress(x.compressed_), suffix); 125 return UniquePosition(after + suffix, suffix); 126 } 127 128 // static. 129 UniquePosition UniquePosition::Between( 130 const UniquePosition& before, 131 const UniquePosition& after, 132 const std::string& suffix) { 133 DCHECK(before.IsValid()); 134 DCHECK(after.IsValid()); 135 DCHECK(before.LessThan(after)); 136 DCHECK(IsValidSuffix(suffix)); 137 const std::string& mid = FindBetweenWithSuffix( 138 Uncompress(before.compressed_), 139 Uncompress(after.compressed_), 140 suffix); 141 return UniquePosition(mid + suffix, suffix); 142 } 143 144 UniquePosition::UniquePosition() : is_valid_(false) {} 145 146 bool UniquePosition::LessThan(const UniquePosition& other) const { 147 DCHECK(this->IsValid()); 148 DCHECK(other.IsValid()); 149 150 return compressed_ < other.compressed_; 151 } 152 153 bool UniquePosition::Equals(const UniquePosition& other) const { 154 if (!this->IsValid() && !other.IsValid()) 155 return true; 156 157 return compressed_ == other.compressed_; 158 } 159 160 void UniquePosition::ToProto(sync_pb::UniquePosition* proto) const { 161 proto->Clear(); 162 163 // This is the current preferred foramt. 164 proto->set_custom_compressed_v1(compressed_); 165 166 // Older clients used to write other formats. We don't bother doing that 167 // anymore because that form of backwards compatibility is expensive. We no 168 // longer want to pay that price just too support clients that have been 169 // obsolete for a long time. See the proto definition for details. 170 } 171 172 void UniquePosition::SerializeToString(std::string* blob) const { 173 DCHECK(blob); 174 sync_pb::UniquePosition proto; 175 ToProto(&proto); 176 proto.SerializeToString(blob); 177 } 178 179 int64 UniquePosition::ToInt64() const { 180 uint64 y = 0; 181 const std::string& s = Uncompress(compressed_); 182 size_t l = sizeof(int64); 183 if (s.length() < l) { 184 NOTREACHED(); 185 l = s.length(); 186 } 187 for (size_t i = 0; i < l; ++i) { 188 const uint8 byte = s[l - i - 1]; 189 y |= static_cast<uint64>(byte) << (i * 8); 190 } 191 y ^= 0x8000000000000000ULL; 192 // This is technically implementation-defined if y > INT64_MAX, so 193 // we're assuming that we're on a twos-complement machine. 194 return static_cast<int64>(y); 195 } 196 197 bool UniquePosition::IsValid() const { 198 return is_valid_; 199 } 200 201 std::string UniquePosition::ToDebugString() const { 202 const std::string bytes = Uncompress(compressed_); 203 if (bytes.empty()) 204 return std::string("INVALID[]"); 205 206 std::string debug_string = base::HexEncode(bytes.data(), bytes.length()); 207 if (!IsValid()) { 208 debug_string = "INVALID[" + debug_string + "]"; 209 } 210 211 std::string compressed_string = 212 base::HexEncode(compressed_.data(), compressed_.length()); 213 debug_string.append(", compressed: " + compressed_string); 214 return debug_string; 215 } 216 217 std::string UniquePosition::GetSuffixForTest() const { 218 const std::string bytes = Uncompress(compressed_); 219 const size_t prefix_len = bytes.length() - kSuffixLength; 220 return bytes.substr(prefix_len, std::string::npos); 221 } 222 223 std::string UniquePosition::FindSmallerWithSuffix( 224 const std::string& reference, 225 const std::string& suffix) { 226 size_t ref_zeroes = reference.find_first_not_of('\0'); 227 size_t suffix_zeroes = suffix.find_first_not_of('\0'); 228 229 // Neither of our inputs are allowed to have trailing zeroes, so the following 230 // must be true. 231 DCHECK_NE(ref_zeroes, std::string::npos); 232 DCHECK_NE(suffix_zeroes, std::string::npos); 233 234 if (suffix_zeroes > ref_zeroes) { 235 // Implies suffix < ref. 236 return std::string(); 237 } 238 239 if (suffix.substr(suffix_zeroes) < reference.substr(ref_zeroes)) { 240 // Prepend zeroes so the result has as many zero digits as |reference|. 241 return std::string(ref_zeroes - suffix_zeroes, '\0'); 242 } else if (suffix_zeroes > 1) { 243 // Prepend zeroes so the result has one more zero digit than |reference|. 244 // We could also take the "else" branch below, but taking this branch will 245 // give us a smaller result. 246 return std::string(ref_zeroes - suffix_zeroes + 1, '\0'); 247 } else { 248 // Prepend zeroes to match those in the |reference|, then something smaller 249 // than the first non-zero digit in |reference|. 250 char lt_digit = static_cast<uint8>(reference[ref_zeroes])/2; 251 return std::string(ref_zeroes, '\0') + lt_digit; 252 } 253 } 254 255 // static 256 std::string UniquePosition::FindGreaterWithSuffix( 257 const std::string& reference, 258 const std::string& suffix) { 259 size_t ref_FFs = reference.find_first_not_of(kuint8max); 260 size_t suffix_FFs = suffix.find_first_not_of(kuint8max); 261 262 if (ref_FFs == std::string::npos) { 263 ref_FFs = reference.length(); 264 } 265 if (suffix_FFs == std::string::npos) { 266 suffix_FFs = suffix.length(); 267 } 268 269 if (suffix_FFs > ref_FFs) { 270 // Implies suffix > reference. 271 return std::string(); 272 } 273 274 if (suffix.substr(suffix_FFs) > reference.substr(ref_FFs)) { 275 // Prepend FF digits to match those in |reference|. 276 return std::string(ref_FFs - suffix_FFs, kuint8max); 277 } else if (suffix_FFs > 1) { 278 // Prepend enough leading FF digits so result has one more of them than 279 // |reference| does. We could also take the "else" branch below, but this 280 // gives us a smaller result. 281 return std::string(ref_FFs - suffix_FFs + 1, kuint8max); 282 } else { 283 // Prepend FF digits to match those in |reference|, then something larger 284 // than the first non-FF digit in |reference|. 285 char gt_digit = static_cast<uint8>(reference[ref_FFs]) + 286 (kuint8max - static_cast<uint8>(reference[ref_FFs]) + 1) / 2; 287 return std::string(ref_FFs, kuint8max) + gt_digit; 288 } 289 } 290 291 // static 292 std::string UniquePosition::FindBetweenWithSuffix( 293 const std::string& before, 294 const std::string& after, 295 const std::string& suffix) { 296 DCHECK(IsValidSuffix(suffix)); 297 DCHECK_NE(before, after); 298 DCHECK_LT(before, after); 299 300 std::string mid; 301 302 // Sometimes our suffix puts us where we want to be. 303 if (before < suffix && suffix < after) { 304 return std::string(); 305 } 306 307 size_t i = 0; 308 for ( ; i < std::min(before.length(), after.length()); ++i) { 309 uint8 a_digit = before[i]; 310 uint8 b_digit = after[i]; 311 312 if (b_digit - a_digit >= 2) { 313 mid.push_back(a_digit + (b_digit - a_digit)/2); 314 return mid; 315 } else if (a_digit == b_digit) { 316 mid.push_back(a_digit); 317 318 // Both strings are equal so far. Will appending the suffix at this point 319 // give us the comparison we're looking for? 320 if (before.substr(i+1) < suffix && suffix < after.substr(i+1)) { 321 return mid; 322 } 323 } else { 324 DCHECK_EQ(b_digit - a_digit, 1); // Implied by above if branches. 325 326 // The two options are off by one digit. The choice of whether to round 327 // up or down here will have consequences on what we do with the remaining 328 // digits. Exploring both options is an optimization and is not required 329 // for the correctness of this algorithm. 330 331 // Option A: Round down the current digit. This makes our |mid| < 332 // |after|, no matter what we append afterwards. We then focus on 333 // appending digits until |mid| > |before|. 334 std::string mid_a = mid; 335 mid_a.push_back(a_digit); 336 mid_a.append(FindGreaterWithSuffix(before.substr(i+1), suffix)); 337 338 // Option B: Round up the current digit. This makes our |mid| > |before|, 339 // no matter what we append afterwards. We then focus on appending digits 340 // until |mid| < |after|. Note that this option may not be viable if the 341 // current digit is the last one in |after|, so we skip the option in that 342 // case. 343 if (after.length() > i+1) { 344 std::string mid_b = mid; 345 mid_b.push_back(b_digit); 346 mid_b.append(FindSmallerWithSuffix(after.substr(i+1), suffix)); 347 348 // Does this give us a shorter position value? If so, use it. 349 if (mid_b.length() < mid_a.length()) { 350 return mid_b; 351 } 352 } 353 return mid_a; 354 } 355 } 356 357 // If we haven't found a midpoint yet, the following must be true: 358 DCHECK_EQ(before.substr(0, i), after.substr(0, i)); 359 DCHECK_EQ(before, mid); 360 DCHECK_LT(before.length(), after.length()); 361 362 // We know that we'll need to append at least one more byte to |mid| in the 363 // process of making it < |after|. Appending any digit, regardless of the 364 // value, will make |before| < |mid|. Therefore, the following will get us a 365 // valid position. 366 367 mid.append(FindSmallerWithSuffix(after.substr(i), suffix)); 368 return mid; 369 } 370 371 UniquePosition::UniquePosition(const std::string& internal_rep) 372 : compressed_(internal_rep), 373 is_valid_(IsValidBytes(Uncompress(internal_rep))) { 374 } 375 376 UniquePosition::UniquePosition( 377 const std::string& uncompressed, 378 const std::string& suffix) 379 : compressed_(Compress(uncompressed)), 380 is_valid_(IsValidBytes(uncompressed)) { 381 DCHECK(uncompressed.rfind(suffix) + kSuffixLength == uncompressed.length()); 382 DCHECK(IsValidSuffix(suffix)); 383 DCHECK(IsValid()); 384 } 385 386 // On custom compression: 387 // 388 // Let C(x) be the compression function and U(x) be the uncompression function. 389 // 390 // This compression scheme has a few special properties. For one, it is 391 // order-preserving. For any two valid position strings x and y: 392 // x < y <=> C(x) < C(y) 393 // This allows us keep the position strings compressed as we sort them. 394 // 395 // The compressed format and the decode algorithm: 396 // 397 // The compressed string is a series of blocks, almost all of which are 8 bytes 398 // in length. The only exception is the last block in the compressed string, 399 // which may be a remainder block, which has length no greater than 7. The 400 // full-length blocks are either repeated character blocks or plain data blocks. 401 // All blocks are entirely self-contained. Their decoded values are independent 402 // from that of their neighbours. 403 // 404 // A repeated character block is encoded into eight bytes and represents between 405 // 4 and 2^31 repeated instances of a given character in the unencoded stream. 406 // The encoding consists of a single character repeated four times, followed by 407 // an encoded count. The encoded count is stored as a big-endian 32 bit 408 // integer. There are 2^31 possible count values, and two encodings for each. 409 // The high encoding is 'enc = kuint32max - count'; the low encoding is 'enc = 410 // count'. At compression time, the algorithm will choose between the two 411 // encodings based on which of the two will maintain the appropriate sort 412 // ordering (by a process which will be described below). The decompression 413 // algorithm need not concern itself with which encoding was used; it needs only 414 // to decode it. The decoded value of this block is "count" instances of the 415 // character that was repeated four times in the first half of this block. 416 // 417 // A plain data block is encoded into eight bytes and represents exactly eight 418 // bytes of data in the unencoded stream. The plain data block must not begin 419 // with the same character repeated four times. It is allowed to contain such a 420 // four-character sequence, just not at the start of the block. The decoded 421 // value of a plain data block is identical to its encoded value. 422 // 423 // A remainder block has length of at most seven. It is a shorter version of 424 // the plain data block. It occurs only at the end of the encoded stream and 425 // represents exactly as many bytes of unencoded data as its own length. Like a 426 // plain data block, the remainder block never begins with the same character 427 // repeated four times. The decoded value of this block is identical to its 428 // encoded value. 429 // 430 // The encode algorithm: 431 // 432 // From the above description, it can be seen that there may be more than one 433 // way to encode a given input string. The encoder must be careful to choose 434 // the encoding that guarantees sort ordering. 435 // 436 // The rules for the encoder are as follows: 437 // 1. Iterate through the input string and produce output blocks one at a time. 438 // 2. Where possible (ie. where the next four bytes of input consist of the 439 // same character repeated four times), produce a repeated data block of 440 // maximum possible length. 441 // 3. If there is at least 8 bytes of data remaining and it is not possible 442 // to produce a repeated character block, produce a plain data block. 443 // 4. If there are less than 8 bytes of data remaining and it is not possible 444 // to produce a repeated character block, produce a remainder block. 445 // 5. When producing a repeated character block, the count encoding must be 446 // chosen in such a way that the sort ordering is maintained. The choice is 447 // best illustrated by way of example: 448 // 449 // When comparing two strings, the first of which begins with of 8 450 // instances of the letter 'B' and the second with 10 instances of the 451 // letter 'B', which of the two should compare lower? The result depends 452 // on the 9th character of the first string, since it will be compared 453 // against the 9th 'B' in the second string. If that character is an 'A', 454 // then the first string will compare lower. If it is a 'C', then the 455 // first string will compare higher. 456 // 457 // The key insight is that the comparison value of a repeated character block 458 // depends on the value of the character that follows it. If the character 459 // follows the repeated character has a value greater than the repeated 460 // character itself, then a shorter run length should translate to a higher 461 // comparison value. Therefore, we encode its count using the low encoding. 462 // Similarly, if the following character is lower, we use the high encoding. 463 464 namespace { 465 466 // Appends an encoded run length to |output_str|. 467 static void WriteEncodedRunLength(uint32 length, 468 bool high_encoding, 469 std::string* output_str) { 470 CHECK_GE(length, 4U); 471 CHECK_LT(length, 0x80000000); 472 473 // Step 1: Invert the count, if necessary, to account for the following digit. 474 uint32 encoded_length; 475 if (high_encoding) { 476 encoded_length = 0xffffffff - length; 477 } else { 478 encoded_length = length; 479 } 480 481 // Step 2: Write it as big-endian so it compares correctly with memcmp(3). 482 output_str->append(1, 0xff & (encoded_length >> 24U)); 483 output_str->append(1, 0xff & (encoded_length >> 16U)); 484 output_str->append(1, 0xff & (encoded_length >> 8U)); 485 output_str->append(1, 0xff & (encoded_length >> 0U)); 486 } 487 488 // Reads an encoded run length for |str| at position |i|. 489 static uint32 ReadEncodedRunLength(const std::string& str, size_t i) { 490 DCHECK_LE(i + 4, str.length()); 491 492 // Step 1: Extract the big-endian count. 493 uint32 encoded_length = 494 ((uint8)(str[i+3]) << 0) | 495 ((uint8)(str[i+2]) << 8) | 496 ((uint8)(str[i+1]) << 16) | 497 ((uint8)(str[i+0]) << 24); 498 499 // Step 2: If this was an inverted count, un-invert it. 500 uint32 length; 501 if (encoded_length & 0x80000000) { 502 length = 0xffffffff - encoded_length; 503 } else { 504 length = encoded_length; 505 } 506 507 return length; 508 } 509 510 // A series of four identical chars at the beginning of a block indicates 511 // the beginning of a repeated character block. 512 static bool IsRepeatedCharPrefix(const std::string& chars, size_t start_index) { 513 return chars[start_index] == chars[start_index+1] 514 && chars[start_index] == chars[start_index+2] 515 && chars[start_index] == chars[start_index+3]; 516 } 517 518 } // namespace 519 520 // static 521 // Wraps the CompressImpl function with a bunch of DCHECKs. 522 std::string UniquePosition::Compress(const std::string& str) { 523 DCHECK(IsValidBytes(str)); 524 std::string compressed = CompressImpl(str); 525 DCHECK(IsValidCompressed(compressed)); 526 DCHECK_EQ(str, Uncompress(compressed)); 527 return compressed; 528 } 529 530 // static 531 // Performs the order preserving run length compression of a given input string. 532 std::string UniquePosition::CompressImpl(const std::string& str) { 533 std::string output; 534 535 // The compressed length will usually be at least as long as the suffix (28), 536 // since the suffix bytes are mostly random. Most are a few bytes longer; a 537 // small few are tens of bytes longer. Some early tests indicated that 538 // roughly 99% had length 40 or smaller. We guess that pre-sizing for 48 is a 539 // good trade-off, but that has not been confirmed through profiling. 540 output.reserve(48); 541 542 // Each loop iteration will consume 8, or N bytes, where N >= 4 and is the 543 // length of a string of identical digits starting at i. 544 for (size_t i = 0; i < str.length(); ) { 545 if (i + 4 <= str.length() && IsRepeatedCharPrefix(str, i)) { 546 // Four identical bytes in a row at this position means that we must start 547 // a repeated character block. Begin by outputting those four bytes. 548 output.append(str, i, 4); 549 550 // Determine the size of the run. 551 const char rep_digit = str[i]; 552 const size_t runs_until = str.find_first_not_of(rep_digit, i+4); 553 554 // Handle the 'runs until end' special case specially. 555 size_t run_length; 556 bool encode_high; // True if the next byte is greater than |rep_digit|. 557 if (runs_until == std::string::npos) { 558 run_length = str.length() - i; 559 encode_high = false; 560 } else { 561 run_length = runs_until - i; 562 encode_high = static_cast<uint8>(str[runs_until]) > 563 static_cast<uint8>(rep_digit); 564 } 565 DCHECK_LT(run_length, static_cast<size_t>(kint32max)) 566 << "This implementation can't encode run-lengths greater than 2^31."; 567 568 WriteEncodedRunLength(run_length, encode_high, &output); 569 i += run_length; // Jump forward by the size of the run length. 570 } else { 571 // Output up to eight bytes without any encoding. 572 const size_t len = std::min(static_cast<size_t>(8), str.length() - i); 573 output.append(str, i, len); 574 i += len; // Jump forward by the amount of input consumed (usually 8). 575 } 576 } 577 578 return output; 579 } 580 581 // static 582 // Uncompresses strings that were compresed with UniquePosition::Compress. 583 std::string UniquePosition::Uncompress(const std::string& str) { 584 std::string output; 585 size_t i = 0; 586 // Iterate through the compressed string one block at a time. 587 for (i = 0; i + 8 <= str.length(); i += 8) { 588 if (IsRepeatedCharPrefix(str, i)) { 589 // Found a repeated character block. Expand it. 590 const char rep_digit = str[i]; 591 uint32 length = ReadEncodedRunLength(str, i+4); 592 output.append(length, rep_digit); 593 } else { 594 // Found a regular block. Copy it. 595 output.append(str, i, 8); 596 } 597 } 598 // Copy the remaining bytes that were too small to form a block. 599 output.append(str, i, std::string::npos); 600 return output; 601 } 602 603 bool UniquePosition::IsValidCompressed(const std::string& str) { 604 for (size_t i = 0; i + 8 <= str.length(); i += 8) { 605 if (IsRepeatedCharPrefix(str, i)) { 606 uint32 count = ReadEncodedRunLength(str, i+4); 607 if (count < 4) { 608 // A repeated character block should at least represent the four 609 // characters that started it. 610 return false; 611 } 612 if (str[i] == str[i+4]) { 613 // Does the next digit after a count match the repeated character? Then 614 // this is not the highest possible count. 615 return false; 616 } 617 } 618 } 619 // We don't bother looking for the existence or checking the validity of 620 // any partial blocks. There's no way they could be invalid anyway. 621 return true; 622 } 623 624 } // namespace syncer 625