Home | History | Annotate | Download | only in strings
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/core/lib/strings/str_util.h"
     17 
     18 #include <ctype.h>
     19 #include <vector>
     20 #include "tensorflow/core/lib/strings/numbers.h"
     21 #include "tensorflow/core/lib/strings/stringprintf.h"
     22 
     23 namespace tensorflow {
     24 namespace str_util {
     25 
     26 static char hex_char[] = "0123456789abcdef";
     27 
     28 string CEscape(StringPiece src) {
     29   string dest;
     30 
     31   for (unsigned char c : src) {
     32     switch (c) {
     33       case '\n':
     34         dest.append("\\n");
     35         break;
     36       case '\r':
     37         dest.append("\\r");
     38         break;
     39       case '\t':
     40         dest.append("\\t");
     41         break;
     42       case '\"':
     43         dest.append("\\\"");
     44         break;
     45       case '\'':
     46         dest.append("\\'");
     47         break;
     48       case '\\':
     49         dest.append("\\\\");
     50         break;
     51       default:
     52         // Note that if we emit \xNN and the src character after that is a hex
     53         // digit then that digit must be escaped too to prevent it being
     54         // interpreted as part of the character code by C.
     55         if ((c >= 0x80) || !isprint(c)) {
     56           dest.append("\\");
     57           dest.push_back(hex_char[c / 64]);
     58           dest.push_back(hex_char[(c % 64) / 8]);
     59           dest.push_back(hex_char[c % 8]);
     60         } else {
     61           dest.push_back(c);
     62           break;
     63         }
     64     }
     65   }
     66 
     67   return dest;
     68 }
     69 
     70 namespace {  // Private helpers for CUnescape().
     71 
     72 inline bool is_octal_digit(unsigned char c) { return c >= '0' && c <= '7'; }
     73 
     74 inline bool ascii_isxdigit(unsigned char c) {
     75   return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
     76          (c >= 'A' && c <= 'F');
     77 }
     78 
     79 inline int hex_digit_to_int(char c) {
     80   int x = static_cast<unsigned char>(c);
     81   if (x > '9') {
     82     x += 9;
     83   }
     84   return x & 0xf;
     85 }
     86 
     87 bool CUnescapeInternal(StringPiece source, string* dest,
     88                        string::size_type* dest_len, string* error) {
     89   const char* p = source.data();
     90   const char* end = source.end();
     91   const char* last_byte = end - 1;
     92 
     93   // We are going to write the result to dest with its iterator. If our string
     94   // implementation uses copy-on-write, this will trigger a copy-on-write of
     95   // dest's buffer; that is, dest will be assigned a new buffer.
     96   //
     97   // Note that the following way is NOT a legal way to modify a string's
     98   // content:
     99   //
    100   //  char* d = const_cast<char*>(dest->data());
    101   //
    102   // This won't trigger copy-on-write of the string, and so is dangerous when
    103   // the buffer is shared.
    104   auto d = dest->begin();
    105 
    106   // Small optimization for case where source = dest and there's no escaping
    107   if (source.data() == dest->data()) {
    108     while (p < end && *p != '\\') {
    109       p++;
    110       d++;
    111     }
    112   }
    113 
    114   while (p < end) {
    115     if (*p != '\\') {
    116       *d++ = *p++;
    117     } else {
    118       if (++p > last_byte) {  // skip past the '\\'
    119         if (error) *error = "String cannot end with \\";
    120         return false;
    121       }
    122       switch (*p) {
    123         case 'a':
    124           *d++ = '\a';
    125           break;
    126         case 'b':
    127           *d++ = '\b';
    128           break;
    129         case 'f':
    130           *d++ = '\f';
    131           break;
    132         case 'n':
    133           *d++ = '\n';
    134           break;
    135         case 'r':
    136           *d++ = '\r';
    137           break;
    138         case 't':
    139           *d++ = '\t';
    140           break;
    141         case 'v':
    142           *d++ = '\v';
    143           break;
    144         case '\\':
    145           *d++ = '\\';
    146           break;
    147         case '?':
    148           *d++ = '\?';
    149           break;  // \?  Who knew?
    150         case '\'':
    151           *d++ = '\'';
    152           break;
    153         case '"':
    154           *d++ = '\"';
    155           break;
    156         case '0':
    157         case '1':
    158         case '2':
    159         case '3':  // octal digit: 1 to 3 digits
    160         case '4':
    161         case '5':
    162         case '6':
    163         case '7': {
    164           const char* octal_start = p;
    165           unsigned int ch = *p - '0';
    166           if (p < last_byte && is_octal_digit(p[1])) ch = ch * 8 + *++p - '0';
    167           if (p < last_byte && is_octal_digit(p[1]))
    168             ch = ch * 8 + *++p - '0';  // now points at last digit
    169           if (ch > 0xff) {
    170             if (error) {
    171               *error = "Value of \\" +
    172                        string(octal_start, p + 1 - octal_start) +
    173                        " exceeds 0xff";
    174             }
    175             return false;
    176           }
    177           *d++ = ch;
    178           break;
    179         }
    180         case 'x':
    181         case 'X': {
    182           if (p >= last_byte) {
    183             if (error) *error = "String cannot end with \\x";
    184             return false;
    185           } else if (!ascii_isxdigit(p[1])) {
    186             if (error) *error = "\\x cannot be followed by a non-hex digit";
    187             return false;
    188           }
    189           unsigned int ch = 0;
    190           const char* hex_start = p;
    191           while (p < last_byte && ascii_isxdigit(p[1]))
    192             // Arbitrarily many hex digits
    193             ch = (ch << 4) + hex_digit_to_int(*++p);
    194           if (ch > 0xFF) {
    195             if (error) {
    196               *error = "Value of \\" + string(hex_start, p + 1 - hex_start) +
    197                        " exceeds 0xff";
    198             }
    199             return false;
    200           }
    201           *d++ = ch;
    202           break;
    203         }
    204         default: {
    205           if (error) *error = string("Unknown escape sequence: \\") + *p;
    206           return false;
    207         }
    208       }
    209       p++;  // read past letter we escaped
    210     }
    211   }
    212   *dest_len = d - dest->begin();
    213   return true;
    214 }
    215 
    216 template <typename T>
    217 bool SplitAndParseAsInts(StringPiece text, char delim,
    218                          std::function<bool(StringPiece, T*)> converter,
    219                          std::vector<T>* result) {
    220   result->clear();
    221   std::vector<string> num_strings = Split(text, delim);
    222   for (const auto& s : num_strings) {
    223     T num;
    224     if (!converter(s, &num)) return false;
    225     result->push_back(num);
    226   }
    227   return true;
    228 }
    229 
    230 }  // namespace
    231 
    232 bool CUnescape(StringPiece source, string* dest, string* error) {
    233   dest->resize(source.size());
    234   string::size_type dest_size;
    235   if (!CUnescapeInternal(source, dest, &dest_size, error)) {
    236     return false;
    237   }
    238   dest->erase(dest_size);
    239   return true;
    240 }
    241 
    242 void StripTrailingWhitespace(string* s) {
    243   string::size_type i;
    244   for (i = s->size(); i > 0 && isspace((*s)[i - 1]); --i) {
    245   }
    246   s->resize(i);
    247 }
    248 
    249 // Return lower-cased version of s.
    250 string Lowercase(StringPiece s) {
    251   string result(s.data(), s.size());
    252   for (char& c : result) {
    253     c = tolower(c);
    254   }
    255   return result;
    256 }
    257 
    258 // Return upper-cased version of s.
    259 string Uppercase(StringPiece s) {
    260   string result(s.data(), s.size());
    261   for (char& c : result) {
    262     c = toupper(c);
    263   }
    264   return result;
    265 }
    266 
    267 string ArgDefCase(StringPiece s) {
    268   const size_t n = s.size();
    269 
    270   // Compute the size of resulting string.
    271   // Number of extra underscores we will need to add.
    272   size_t extra_us = 0;
    273   // Number of non-alpha chars in the beginning to skip.
    274   size_t to_skip = 0;
    275   for (size_t i = 0; i < n; ++i) {
    276     // If we are skipping and current letter is non-alpha, skip it as well
    277     if (i == to_skip && !isalpha(s[i])) {
    278       ++to_skip;
    279       continue;
    280     }
    281 
    282     // If we are here, we are not skipping any more.
    283     // If this letter is upper case, not the very first char in the
    284     // resulting string, and previous letter isn't replaced with an underscore,
    285     // we will need to insert an underscore.
    286     if (isupper(s[i]) && i != to_skip && i > 0 && isalnum(s[i - 1])) {
    287       ++extra_us;
    288     }
    289   }
    290 
    291   // Initialize result with all '_'s. There is no string
    292   // constructor that does not initialize memory.
    293   string result(n + extra_us - to_skip, '_');
    294   // i - index into s
    295   // j - index into result
    296   for (size_t i = to_skip, j = 0; i < n; ++i, ++j) {
    297     DCHECK_LT(j, result.size());
    298     char c = s[i];
    299     // If c is not alphanumeric, we don't need to do anything
    300     // since there is already an underscore in its place.
    301     if (isalnum(c)) {
    302       if (isupper(c)) {
    303         // If current char is upper case, we might need to insert an
    304         // underscore.
    305         if (i != to_skip) {
    306           DCHECK_GT(j, 0);
    307           if (result[j - 1] != '_') ++j;
    308         }
    309         result[j] = tolower(c);
    310       } else {
    311         result[j] = c;
    312       }
    313     }
    314   }
    315 
    316   return result;
    317 }
    318 
    319 void TitlecaseString(string* s, StringPiece delimiters) {
    320   bool upper = true;
    321   for (string::iterator ss = s->begin(); ss != s->end(); ++ss) {
    322     if (upper) {
    323       *ss = toupper(*ss);
    324     }
    325     upper = (delimiters.find(*ss) != StringPiece::npos);
    326   }
    327 }
    328 
    329 string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
    330                      bool replace_all) {
    331   // TODO(jlebar): We could avoid having to shift data around in the string if
    332   // we had a StringPiece::find() overload that searched for a StringPiece.
    333   string res = s.ToString();
    334   size_t pos = 0;
    335   while ((pos = res.find(oldsub.data(), pos, oldsub.size())) != string::npos) {
    336     res.replace(pos, oldsub.size(), newsub.data(), newsub.size());
    337     pos += newsub.size();
    338     if (oldsub.empty()) {
    339       pos++;  // Match at the beginning of the text and after every byte
    340     }
    341     if (!replace_all) {
    342       break;
    343     }
    344   }
    345   return res;
    346 }
    347 
    348 size_t RemoveLeadingWhitespace(StringPiece* text) {
    349   size_t count = 0;
    350   const char* ptr = text->data();
    351   while (count < text->size() && isspace(*ptr)) {
    352     count++;
    353     ptr++;
    354   }
    355   text->remove_prefix(count);
    356   return count;
    357 }
    358 
    359 size_t RemoveTrailingWhitespace(StringPiece* text) {
    360   size_t count = 0;
    361   const char* ptr = text->data() + text->size() - 1;
    362   while (count < text->size() && isspace(*ptr)) {
    363     ++count;
    364     --ptr;
    365   }
    366   text->remove_suffix(count);
    367   return count;
    368 }
    369 
    370 size_t RemoveWhitespaceContext(StringPiece* text) {
    371   // use RemoveLeadingWhitespace() and RemoveTrailingWhitespace() to do the job
    372   return (RemoveLeadingWhitespace(text) + RemoveTrailingWhitespace(text));
    373 }
    374 
    375 bool ConsumePrefix(StringPiece* s, StringPiece expected) {
    376   if (s->starts_with(expected)) {
    377     s->remove_prefix(expected.size());
    378     return true;
    379   }
    380   return false;
    381 }
    382 
    383 bool ConsumeSuffix(StringPiece* s, StringPiece expected) {
    384   if (s->ends_with(expected)) {
    385     s->remove_suffix(expected.size());
    386     return true;
    387   }
    388   return false;
    389 }
    390 
    391 bool ConsumeLeadingDigits(StringPiece* s, uint64* val) {
    392   const char* p = s->data();
    393   const char* limit = p + s->size();
    394   uint64 v = 0;
    395   while (p < limit) {
    396     const char c = *p;
    397     if (c < '0' || c > '9') break;
    398     uint64 new_v = (v * 10) + (c - '0');
    399     if (new_v / 8 < v) {
    400       // Overflow occurred
    401       return false;
    402     }
    403     v = new_v;
    404     p++;
    405   }
    406   if (p > s->data()) {
    407     // Consume some digits
    408     s->remove_prefix(p - s->data());
    409     *val = v;
    410     return true;
    411   } else {
    412     return false;
    413   }
    414 }
    415 
    416 bool ConsumeNonWhitespace(StringPiece* s, StringPiece* val) {
    417   const char* p = s->data();
    418   const char* limit = p + s->size();
    419   while (p < limit) {
    420     const char c = *p;
    421     if (isspace(c)) break;
    422     p++;
    423   }
    424   const size_t n = p - s->data();
    425   if (n > 0) {
    426     *val = StringPiece(s->data(), n);
    427     s->remove_prefix(n);
    428     return true;
    429   } else {
    430     *val = StringPiece();
    431     return false;
    432   }
    433 }
    434 
    435 bool SplitAndParseAsInts(StringPiece text, char delim,
    436                          std::vector<int32>* result) {
    437   return SplitAndParseAsInts<int32>(text, delim, strings::safe_strto32, result);
    438 }
    439 
    440 bool SplitAndParseAsInts(StringPiece text, char delim,
    441                          std::vector<int64>* result) {
    442   return SplitAndParseAsInts<int64>(text, delim, strings::safe_strto64, result);
    443 }
    444 
    445 bool SplitAndParseAsFloats(StringPiece text, char delim,
    446                            std::vector<float>* result) {
    447   return SplitAndParseAsInts<float>(text, delim,
    448                                     [](StringPiece str, float* value) {
    449                                       return strings::safe_strtof(
    450                                           str.ToString().c_str(), value);
    451                                     },
    452                                     result);
    453 }
    454 
    455 }  // namespace str_util
    456 }  // namespace tensorflow
    457