Home | History | Annotate | Download | only in strings
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/core/lib/strings/ordered_code.h"
     17 
     18 #include <assert.h>
     19 #include <stddef.h>
     20 
     21 #include "tensorflow/core/lib/core/stringpiece.h"
     22 #include "tensorflow/core/platform/logging.h"
     23 
     24 namespace tensorflow {
     25 namespace strings {
     26 
     27 // We encode a string in different ways depending on whether the item
     28 // should be in lexicographically increasing or decreasing order.
     29 //
     30 //
     31 // Lexicographically increasing order
     32 //
     33 // We want a string-to-string mapping F(x) such that for any two strings
     34 //
     35 //      x < y   =>   F(x) < F(y)
     36 //
     37 // In addition to the normal characters '\x00' through '\xff', we want to
     38 // encode a few extra symbols in strings:
     39 //
     40 //      <sep>           Separator between items
     41 //      <infinity>      Infinite string
     42 //
     43 // Therefore we need an alphabet with at least 258 symbols.  Each
     44 // character '\1' through '\xfe' is mapped to itself.  The other four are
     45 // encoded into two-letter sequences starting with '\0' and '\xff':
     46 //
     47 //      <sep>           encoded as =>           \0\1
     48 //      \0              encoded as =>           \0\xff
     49 //      \xff            encoded as =>           \xff\x00
     50 //      <infinity>      encoded as =>           \xff\xff
     51 //
     52 // The remaining two-letter sequences starting with '\0' and '\xff' are
     53 // currently unused.
     54 //
     55 // F(<infinity>) is defined above.  For any finite string x, F(x) is the
     56 // the encodings of x's characters followed by the encoding for <sep>.  The
     57 // ordering of two finite strings is the same as the ordering of the
     58 // respective characters at the first position where they differ, which in
     59 // turn is the same as the ordering of the encodings of those two
     60 // characters.  Moreover, for every finite string x, F(x) < F(<infinity>).
     61 //
     62 //
     63 // Lexicographically decreasing order
     64 //
     65 // We want a string-to-string mapping G(x) such that for any two strings,
     66 // whether finite or not,
     67 //
     68 //      x < y   =>   G(x) > G(y)
     69 //
     70 // To achieve this, define G(x) to be the inversion of F(x): I(F(x)).  In
     71 // other words, invert every bit in F(x) to get G(x). For example,
     72 //
     73 //        x  = \x00\x13\xff
     74 //      F(x) = \x00\xff\x13\xff\x00\x00\x01  escape \0, \xff, append F(<sep>)
     75 //      G(x) = \xff\x00\xec\x00\xff\xff\xfe  invert every bit in F(x)
     76 //
     77 //        x  = <infinity>
     78 //      F(x) = \xff\xff
     79 //      G(x) = \x00\x00
     80 //
     81 // Another example is
     82 //
     83 //        x            F(x)        G(x) = I(F(x))
     84 //        -            ----        --------------
     85 //        <infinity>   \xff\xff    \x00\x00
     86 //        "foo"        foo\0\1     \x99\x90\x90\xff\xfe
     87 //        "aaa"        aaa\0\1     \x9e\x9e\x9e\xff\xfe
     88 //        "aa"         aa\0\1      \x9e\x9e\xff\xfe
     89 //        ""           \0\1        \xff\xfe
     90 //
     91 // More generally and rigorously, if for any two strings x and y
     92 //
     93 //      F(x) < F(y)   =>   I(F(x)) > I(F(y))                      (1)
     94 //
     95 // it would follow that x < y => G(x) > G(y) because
     96 //
     97 //      x < y   =>   F(x) < F(y)   =>   G(x) = I(F(x)) > I(F(y)) = G(y)
     98 //
     99 // We now show why (1) is true, in two parts.  Notice that for any two
    100 // strings x < y, F(x) is *not* a proper prefix of F(y).  Suppose x is a
    101 // proper prefix of y (say, x="abc" < y="abcd").  F(x) and F(y) diverge at
    102 // the F(<sep>) in F(x) (v. F('d') in the example).  Suppose x is not a
    103 // proper prefix of y (say, x="abce" < y="abd"), F(x) and F(y) diverge at
    104 // their respective encodings of the characters where x and y diverge
    105 // (F('c') v. F('d')).  Finally, if y=<infinity>, we can see that
    106 // F(y)=\xff\xff is not the prefix of F(x) for any finite string x, simply
    107 // by considering all the possible first characters of F(x).
    108 //
    109 // Given that F(x) is not a proper prefix F(y), the order of F(x) and F(y)
    110 // is determined by the byte where F(x) and F(y) diverge.  For example, the
    111 // order of F(x)="eefh" and F(y)="eeg" is determined by their third
    112 // characters.  I(p) inverts each byte in p, which effectively subtracts
    113 // each byte from 0xff.  So, in this example, I('f') > I('g'), and thus
    114 // I(F(x)) > I(F(y)).
    115 //
    116 //
    117 // Implementation
    118 //
    119 // To implement G(x) efficiently, we use C++ template to instantiate two
    120 // versions of the code to produce F(x), one for normal encoding (giving us
    121 // F(x)) and one for inverted encoding (giving us G(x) = I(F(x))).
    122 
    123 static const char kEscape1 = '\000';
    124 static const char kNullCharacter = '\xff';  // Combined with kEscape1
    125 static const char kSeparator = '\001';      // Combined with kEscape1
    126 
    127 static const char kEscape2 = '\xff';
    128 static const char kFFCharacter = '\000';  // Combined with kEscape2
    129 
    130 static const char kEscape1_Separator[2] = {kEscape1, kSeparator};
    131 
    132 // Append to "*dest" the "len" bytes starting from "*src".
    133 inline static void AppendBytes(string* dest, const char* src, size_t len) {
    134   dest->append(src, len);
    135 }
    136 
    137 inline bool IsSpecialByte(char c) {
    138   return (static_cast<unsigned char>(c + 1)) < 2;
    139 }
    140 
    141 // Return a pointer to the first byte in the range "[start..limit)"
    142 // whose value is 0 or 255 (kEscape1 or kEscape2).  If no such byte
    143 // exists in the range, returns "limit".
    144 inline const char* SkipToNextSpecialByte(const char* start, const char* limit) {
    145   // If these constants were ever changed, this routine needs to change
    146   DCHECK_EQ(kEscape1, 0);
    147   DCHECK_EQ(kEscape2 & 0xffu, 255u);
    148   const char* p = start;
    149   while (p < limit && !IsSpecialByte(*p)) {
    150     p++;
    151   }
    152   return p;
    153 }
    154 
    155 // Expose SkipToNextSpecialByte for testing purposes
    156 const char* OrderedCode::TEST_SkipToNextSpecialByte(const char* start,
    157                                                     const char* limit) {
    158   return SkipToNextSpecialByte(start, limit);
    159 }
    160 
    161 // Helper routine to encode "s" and append to "*dest", escaping special
    162 // characters.
    163 inline static void EncodeStringFragment(string* dest, StringPiece s) {
    164   const char* p = s.data();
    165   const char* limit = p + s.size();
    166   const char* copy_start = p;
    167   while (true) {
    168     p = SkipToNextSpecialByte(p, limit);
    169     if (p >= limit) break;  // No more special characters that need escaping
    170     char c = *(p++);
    171     DCHECK(IsSpecialByte(c));
    172     if (c == kEscape1) {
    173       AppendBytes(dest, copy_start, p - copy_start - 1);
    174       dest->push_back(kEscape1);
    175       dest->push_back(kNullCharacter);
    176       copy_start = p;
    177     } else {
    178       assert(c == kEscape2);
    179       AppendBytes(dest, copy_start, p - copy_start - 1);
    180       dest->push_back(kEscape2);
    181       dest->push_back(kFFCharacter);
    182       copy_start = p;
    183     }
    184   }
    185   if (p > copy_start) {
    186     AppendBytes(dest, copy_start, p - copy_start);
    187   }
    188 }
    189 
    190 void OrderedCode::WriteString(string* dest, StringPiece s) {
    191   EncodeStringFragment(dest, s);
    192   AppendBytes(dest, kEscape1_Separator, 2);
    193 }
    194 
    195 void OrderedCode::WriteNumIncreasing(string* dest, uint64 val) {
    196   // Values are encoded with a single byte length prefix, followed
    197   // by the actual value in big-endian format with leading 0 bytes
    198   // dropped.
    199   unsigned char buf[9];  // 8 bytes for value plus one byte for length
    200   int len = 0;
    201   while (val > 0) {
    202     len++;
    203     buf[9 - len] = (val & 0xff);
    204     val >>= 8;
    205   }
    206   buf[9 - len - 1] = len;
    207   len++;
    208   AppendBytes(dest, reinterpret_cast<const char*>(buf + 9 - len), len);
    209 }
    210 
    211 // Parse the encoding of a previously encoded string.
    212 // If parse succeeds, return true, consume encoding from
    213 // "*src", and if result != NULL append the decoded string to "*result".
    214 // Otherwise, return false and leave both undefined.
    215 inline static bool ReadStringInternal(StringPiece* src, string* result) {
    216   const char* start = src->data();
    217   const char* string_limit = src->data() + src->size();
    218 
    219   // We only scan up to "limit-2" since a valid string must end with
    220   // a two character terminator: 'kEscape1 kSeparator'
    221   const char* limit = string_limit - 1;
    222   const char* copy_start = start;
    223   while (true) {
    224     start = SkipToNextSpecialByte(start, limit);
    225     if (start >= limit) break;  // No terminator sequence found
    226     const char c = *(start++);
    227     // If inversion is required, instead of inverting 'c', we invert the
    228     // character constants to which 'c' is compared.  We get the same
    229     // behavior but save the runtime cost of inverting 'c'.
    230     DCHECK(IsSpecialByte(c));
    231     if (c == kEscape1) {
    232       if (result) {
    233         AppendBytes(result, copy_start, start - copy_start - 1);
    234       }
    235       // kEscape1 kSeparator ends component
    236       // kEscape1 kNullCharacter represents '\0'
    237       const char next = *(start++);
    238       if (next == kSeparator) {
    239         src->remove_prefix(start - src->data());
    240         return true;
    241       } else if (next == kNullCharacter) {
    242         if (result) {
    243           *result += '\0';
    244         }
    245       } else {
    246         return false;
    247       }
    248       copy_start = start;
    249     } else {
    250       assert(c == kEscape2);
    251       if (result) {
    252         AppendBytes(result, copy_start, start - copy_start - 1);
    253       }
    254       // kEscape2 kFFCharacter represents '\xff'
    255       // kEscape2 kInfinity is an error
    256       const char next = *(start++);
    257       if (next == kFFCharacter) {
    258         if (result) {
    259           *result += '\xff';
    260         }
    261       } else {
    262         return false;
    263       }
    264       copy_start = start;
    265     }
    266   }
    267   return false;
    268 }
    269 
    270 bool OrderedCode::ReadString(StringPiece* src, string* result) {
    271   return ReadStringInternal(src, result);
    272 }
    273 
    274 bool OrderedCode::ReadNumIncreasing(StringPiece* src, uint64* result) {
    275   if (src->empty()) {
    276     return false;  // Not enough bytes
    277   }
    278 
    279   // Decode length byte
    280   const size_t len = static_cast<unsigned char>((*src)[0]);
    281 
    282   // If len > 0 and src is longer than 1, the first byte of "payload"
    283   // must be non-zero (otherwise the encoding is not minimal).
    284   // In opt mode, we don't enforce that encodings must be minimal.
    285   DCHECK(0 == len || src->size() == 1 || (*src)[1] != '\0')
    286       << "invalid encoding";
    287 
    288   if (len + 1 > src->size() || len > 8) {
    289     return false;  // Not enough bytes or too many bytes
    290   }
    291 
    292   if (result) {
    293     uint64 tmp = 0;
    294     for (size_t i = 0; i < len; i++) {
    295       tmp <<= 8;
    296       tmp |= static_cast<unsigned char>((*src)[1 + i]);
    297     }
    298     *result = tmp;
    299   }
    300   src->remove_prefix(len + 1);
    301   return true;
    302 }
    303 
    304 void OrderedCode::TEST_Corrupt(string* str, int k) {
    305   int seen_seps = 0;
    306   for (size_t i = 0; i + 1 < str->size(); i++) {
    307     if ((*str)[i] == kEscape1 && (*str)[i + 1] == kSeparator) {
    308       seen_seps++;
    309       if (seen_seps == k) {
    310         (*str)[i + 1] = kSeparator + 1;
    311         return;
    312       }
    313     }
    314   }
    315 }
    316 
    317 // Signed number encoding/decoding /////////////////////////////////////
    318 //
    319 // The format is as follows:
    320 //
    321 // The first bit (the most significant bit of the first byte)
    322 // represents the sign, 0 if the number is negative and
    323 // 1 if the number is >= 0.
    324 //
    325 // Any unbroken sequence of successive bits with the same value as the sign
    326 // bit, up to 9 (the 8th and 9th are the most significant bits of the next
    327 // byte), are size bits that count the number of bytes after the first byte.
    328 // That is, the total length is between 1 and 10 bytes.
    329 //
    330 // The value occupies the bits after the sign bit and the "size bits"
    331 // till the end of the string, in network byte order.  If the number
    332 // is negative, the bits are in 2-complement.
    333 //
    334 //
    335 // Example 1: number 0x424242 -> 4 byte big-endian hex string 0xf0424242:
    336 //
    337 // +---------------+---------------+---------------+---------------+
    338 //  1 1 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0
    339 // +---------------+---------------+---------------+---------------+
    340 //  ^ ^ ^ ^   ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
    341 //  | | | |   | | | | | | | | | | | | | | | | | | | | | | | | | | |
    342 //  | | | |   payload: the remaining bits after the sign and size bits
    343 //  | | | |            and the delimiter bit, the value is 0x424242
    344 //  | | | |
    345 //  | size bits: 3 successive bits with the same value as the sign bit
    346 //  |            (followed by a delimiter bit with the opposite value)
    347 //  |            mean that there are 3 bytes after the first byte, 4 total
    348 //  |
    349 //  sign bit: 1 means that the number is non-negative
    350 //
    351 // Example 2: negative number -0x800 -> 2 byte big-endian hex string 0x3800:
    352 //
    353 // +---------------+---------------+
    354 //  0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0
    355 // +---------------+---------------+
    356 //  ^ ^   ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
    357 //  | |   | | | | | | | | | | | | | | | | | | | | | | | | | | |
    358 //  | |   payload: the remaining bits after the sign and size bits and the
    359 //  | |            delimiter bit, 2-complement because of the negative sign,
    360 //  | |            value is ~0x7ff, represents the value -0x800
    361 //  | |
    362 //  | size bits: 1 bit with the same value as the sign bit
    363 //  |            (followed by a delimiter bit with the opposite value)
    364 //  |            means that there is 1 byte after the first byte, 2 total
    365 //  |
    366 //  sign bit: 0 means that the number is negative
    367 //
    368 //
    369 // Compared with the simpler unsigned format used for uint64 numbers,
    370 // this format is more compact for small numbers, namely one byte encodes
    371 // numbers in the range [-64,64), two bytes cover the range [-2^13,2^13), etc.
    372 // In general, n bytes encode numbers in the range [-2^(n*7-1),2^(n*7-1)).
    373 // (The cross-over point for compactness of representation is 8 bytes,
    374 // where this format only covers the range [-2^55,2^55),
    375 // whereas an encoding with sign bit and length in the first byte and
    376 // payload in all following bytes would cover [-2^56,2^56).)
    377 
    378 static const int kMaxSigned64Length = 10;
    379 
    380 // This array maps encoding length to header bits in the first two bytes.
    381 static const char kLengthToHeaderBits[1 + kMaxSigned64Length][2] = {
    382     {0, 0},      {'\x80', 0},      {'\xc0', 0},     {'\xe0', 0},
    383     {'\xf0', 0}, {'\xf8', 0},      {'\xfc', 0},     {'\xfe', 0},
    384     {'\xff', 0}, {'\xff', '\x80'}, {'\xff', '\xc0'}};
    385 
    386 // This array maps encoding lengths to the header bits that overlap with
    387 // the payload and need fixing when reading.
    388 static const uint64 kLengthToMask[1 + kMaxSigned64Length] = {
    389     0ULL,
    390     0x80ULL,
    391     0xc000ULL,
    392     0xe00000ULL,
    393     0xf0000000ULL,
    394     0xf800000000ULL,
    395     0xfc0000000000ULL,
    396     0xfe000000000000ULL,
    397     0xff00000000000000ULL,
    398     0x8000000000000000ULL,
    399     0ULL};
    400 
    401 // This array maps the number of bits in a number to the encoding
    402 // length produced by WriteSignedNumIncreasing.
    403 // For positive numbers, the number of bits is 1 plus the most significant
    404 // bit position (the highest bit position in a positive int64 is 63).
    405 // For a negative number n, we count the bits in ~n.
    406 // That is, length = kBitsToLength[Bits::Log2Floor64(n < 0 ? ~n : n) + 1].
    407 static const int8 kBitsToLength[1 + 63] = {
    408     1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4,
    409     4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7,
    410     7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10};
    411 
    412 #if defined(__GNUC__)
    413 // Returns floor(lg(n)).  Returns -1 if n == 0.
    414 static int Log2Floor64(uint64 n) {
    415   return n == 0 ? -1 : 63 ^ __builtin_clzll(n);
    416 }
    417 #else
    418 // Portable slow version
    419 static int Log2Floor32_Portable(uint32 n) {
    420   if (n == 0) return -1;
    421   int log = 0;
    422   uint32 value = n;
    423   for (int i = 4; i >= 0; --i) {
    424     int shift = (1 << i);
    425     uint32 x = value >> shift;
    426     if (x != 0) {
    427       value = x;
    428       log += shift;
    429     }
    430   }
    431   assert(value == 1);
    432   return log;
    433 }
    434 // Returns floor(lg(n)).  Returns -1 if n == 0.
    435 static int Log2Floor64(uint64 n) {
    436   const uint32 topbits = static_cast<uint32>(n >> 32);
    437   if (topbits == 0) {
    438     // Top bits are zero, so scan in bottom bits
    439     return Log2Floor32_Portable(static_cast<uint32>(n));
    440   } else {
    441     return 32 + Log2Floor32_Portable(topbits);
    442   }
    443 }
    444 #endif
    445 
    446 // Calculates the encoding length in bytes of the signed number n.
    447 static inline int SignedEncodingLength(int64 n) {
    448   return kBitsToLength[Log2Floor64(n < 0 ? ~n : n) + 1];
    449 }
    450 
    451 static void StoreBigEndian64(char* dst, uint64 v) {
    452   for (int i = 0; i < 8; i++) {
    453     dst[i] = (v >> (56 - 8 * i)) & 0xff;
    454   }
    455 }
    456 
    457 static uint64 LoadBigEndian64(const char* src) {
    458   uint64 result = 0;
    459   for (int i = 0; i < 8; i++) {
    460     unsigned char c = static_cast<unsigned char>(src[i]);
    461     result |= static_cast<uint64>(c) << (56 - 8 * i);
    462   }
    463   return result;
    464 }
    465 
    466 void OrderedCode::WriteSignedNumIncreasing(string* dest, int64 val) {
    467   const uint64 x = val < 0 ? ~val : val;
    468   if (x < 64) {  // fast path for encoding length == 1
    469     *dest += kLengthToHeaderBits[1][0] ^ val;
    470     return;
    471   }
    472   // buf = val in network byte order, sign extended to 10 bytes
    473   const char sign_byte = val < 0 ? '\xff' : '\0';
    474   char buf[10] = {
    475       sign_byte,
    476       sign_byte,
    477   };
    478   StoreBigEndian64(buf + 2, val);
    479   static_assert(sizeof(buf) == kMaxSigned64Length, "max length size mismatch");
    480   const int len = SignedEncodingLength(x);
    481   DCHECK_GE(len, 2);
    482   char* const begin = buf + sizeof(buf) - len;
    483   begin[0] ^= kLengthToHeaderBits[len][0];
    484   begin[1] ^= kLengthToHeaderBits[len][1];  // ok because len >= 2
    485   dest->append(begin, len);
    486 }
    487 
    488 bool OrderedCode::ReadSignedNumIncreasing(StringPiece* src, int64* result) {
    489   if (src->empty()) return false;
    490   const uint64 xor_mask = (!((*src)[0] & 0x80)) ? ~0ULL : 0ULL;
    491   const unsigned char first_byte = (*src)[0] ^ (xor_mask & 0xff);
    492 
    493   // now calculate and test length, and set x to raw (unmasked) result
    494   int len;
    495   uint64 x;
    496   if (first_byte != 0xff) {
    497     len = 7 - Log2Floor64(first_byte ^ 0xff);
    498     if (src->size() < static_cast<size_t>(len)) return false;
    499     x = xor_mask;  // sign extend using xor_mask
    500     for (int i = 0; i < len; ++i)
    501       x = (x << 8) | static_cast<unsigned char>((*src)[i]);
    502   } else {
    503     len = 8;
    504     if (src->size() < static_cast<size_t>(len)) return false;
    505     const unsigned char second_byte = (*src)[1] ^ (xor_mask & 0xff);
    506     if (second_byte >= 0x80) {
    507       if (second_byte < 0xc0) {
    508         len = 9;
    509       } else {
    510         const unsigned char third_byte = (*src)[2] ^ (xor_mask & 0xff);
    511         if (second_byte == 0xc0 && third_byte < 0x80) {
    512           len = 10;
    513         } else {
    514           return false;  // either len > 10 or len == 10 and #bits > 63
    515         }
    516       }
    517       if (src->size() < static_cast<size_t>(len)) return false;
    518     }
    519     x = LoadBigEndian64(src->data() + len - 8);
    520   }
    521 
    522   x ^= kLengthToMask[len];  // remove spurious header bits
    523 
    524   DCHECK_EQ(len, SignedEncodingLength(x)) << "invalid encoding";
    525 
    526   if (result) *result = x;
    527   src->remove_prefix(len);
    528   return true;
    529 }
    530 
    531 }  // namespace strings
    532 }  // namespace tensorflow
    533