Home | History | Annotate | Download | only in strings
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/core/lib/strings/str_util.h"
     17 
     18 #include <ctype.h>
     19 #include <algorithm>
     20 #include <cstring>
     21 #include <vector>
     22 #include "tensorflow/core/lib/strings/numbers.h"
     23 #include "tensorflow/core/lib/strings/stringprintf.h"
     24 #include "tensorflow/core/platform/logging.h"
     25 
     26 namespace tensorflow {
     27 namespace str_util {
     28 
     29 static char hex_char[] = "0123456789abcdef";
     30 
     31 string CEscape(StringPiece src) {
     32   string dest;
     33 
     34   for (unsigned char c : src) {
     35     switch (c) {
     36       case '\n':
     37         dest.append("\\n");
     38         break;
     39       case '\r':
     40         dest.append("\\r");
     41         break;
     42       case '\t':
     43         dest.append("\\t");
     44         break;
     45       case '\"':
     46         dest.append("\\\"");
     47         break;
     48       case '\'':
     49         dest.append("\\'");
     50         break;
     51       case '\\':
     52         dest.append("\\\\");
     53         break;
     54       default:
     55         // Note that if we emit \xNN and the src character after that is a hex
     56         // digit then that digit must be escaped too to prevent it being
     57         // interpreted as part of the character code by C.
     58         if ((c >= 0x80) || !isprint(c)) {
     59           dest.append("\\");
     60           dest.push_back(hex_char[c / 64]);
     61           dest.push_back(hex_char[(c % 64) / 8]);
     62           dest.push_back(hex_char[c % 8]);
     63         } else {
     64           dest.push_back(c);
     65           break;
     66         }
     67     }
     68   }
     69 
     70   return dest;
     71 }
     72 
     73 namespace {  // Private helpers for CUnescape().
     74 
     75 inline bool is_octal_digit(unsigned char c) { return c >= '0' && c <= '7'; }
     76 
     77 inline bool ascii_isxdigit(unsigned char c) {
     78   return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') ||
     79          (c >= 'A' && c <= 'F');
     80 }
     81 
     82 inline int hex_digit_to_int(char c) {
     83   int x = static_cast<unsigned char>(c);
     84   if (x > '9') {
     85     x += 9;
     86   }
     87   return x & 0xf;
     88 }
     89 
     90 bool CUnescapeInternal(StringPiece source, string* dest,
     91                        string::size_type* dest_len, string* error) {
     92   const char* p = source.data();
     93   const char* end = source.end();
     94   const char* last_byte = end - 1;
     95 
     96   // We are going to write the result to dest with its iterator. If our string
     97   // implementation uses copy-on-write, this will trigger a copy-on-write of
     98   // dest's buffer; that is, dest will be assigned a new buffer.
     99   //
    100   // Note that the following way is NOT a legal way to modify a string's
    101   // content:
    102   //
    103   //  char* d = const_cast<char*>(dest->data());
    104   //
    105   // This won't trigger copy-on-write of the string, and so is dangerous when
    106   // the buffer is shared.
    107   auto d = dest->begin();
    108 
    109   // Small optimization for case where source = dest and there's no escaping
    110   if (source.data() == dest->data()) {
    111     while (p < end && *p != '\\') {
    112       p++;
    113       d++;
    114     }
    115   }
    116 
    117   while (p < end) {
    118     if (*p != '\\') {
    119       *d++ = *p++;
    120     } else {
    121       if (++p > last_byte) {  // skip past the '\\'
    122         if (error) *error = "String cannot end with \\";
    123         return false;
    124       }
    125       switch (*p) {
    126         case 'a':
    127           *d++ = '\a';
    128           break;
    129         case 'b':
    130           *d++ = '\b';
    131           break;
    132         case 'f':
    133           *d++ = '\f';
    134           break;
    135         case 'n':
    136           *d++ = '\n';
    137           break;
    138         case 'r':
    139           *d++ = '\r';
    140           break;
    141         case 't':
    142           *d++ = '\t';
    143           break;
    144         case 'v':
    145           *d++ = '\v';
    146           break;
    147         case '\\':
    148           *d++ = '\\';
    149           break;
    150         case '?':
    151           *d++ = '\?';
    152           break;  // \?  Who knew?
    153         case '\'':
    154           *d++ = '\'';
    155           break;
    156         case '"':
    157           *d++ = '\"';
    158           break;
    159         case '0':
    160         case '1':
    161         case '2':
    162         case '3':  // octal digit: 1 to 3 digits
    163         case '4':
    164         case '5':
    165         case '6':
    166         case '7': {
    167           const char* octal_start = p;
    168           unsigned int ch = *p - '0';
    169           if (p < last_byte && is_octal_digit(p[1])) ch = ch * 8 + *++p - '0';
    170           if (p < last_byte && is_octal_digit(p[1]))
    171             ch = ch * 8 + *++p - '0';  // now points at last digit
    172           if (ch > 0xff) {
    173             if (error) {
    174               *error = "Value of \\" +
    175                        string(octal_start, p + 1 - octal_start) +
    176                        " exceeds 0xff";
    177             }
    178             return false;
    179           }
    180           *d++ = ch;
    181           break;
    182         }
    183         case 'x':
    184         case 'X': {
    185           if (p >= last_byte) {
    186             if (error) *error = "String cannot end with \\x";
    187             return false;
    188           } else if (!ascii_isxdigit(p[1])) {
    189             if (error) *error = "\\x cannot be followed by a non-hex digit";
    190             return false;
    191           }
    192           unsigned int ch = 0;
    193           const char* hex_start = p;
    194           while (p < last_byte && ascii_isxdigit(p[1]))
    195             // Arbitrarily many hex digits
    196             ch = (ch << 4) + hex_digit_to_int(*++p);
    197           if (ch > 0xFF) {
    198             if (error) {
    199               *error = "Value of \\" + string(hex_start, p + 1 - hex_start) +
    200                        " exceeds 0xff";
    201             }
    202             return false;
    203           }
    204           *d++ = ch;
    205           break;
    206         }
    207         default: {
    208           if (error) *error = string("Unknown escape sequence: \\") + *p;
    209           return false;
    210         }
    211       }
    212       p++;  // read past letter we escaped
    213     }
    214   }
    215   *dest_len = d - dest->begin();
    216   return true;
    217 }
    218 
    219 template <typename T>
    220 bool SplitAndParseAsInts(StringPiece text, char delim,
    221                          std::function<bool(StringPiece, T*)> converter,
    222                          std::vector<T>* result) {
    223   result->clear();
    224   std::vector<string> num_strings = Split(text, delim);
    225   for (const auto& s : num_strings) {
    226     T num;
    227     if (!converter(s, &num)) return false;
    228     result->push_back(num);
    229   }
    230   return true;
    231 }
    232 
    233 }  // namespace
    234 
    235 bool CUnescape(StringPiece source, string* dest, string* error) {
    236   dest->resize(source.size());
    237   string::size_type dest_size;
    238   if (!CUnescapeInternal(source, dest, &dest_size, error)) {
    239     return false;
    240   }
    241   dest->erase(dest_size);
    242   return true;
    243 }
    244 
    245 void StripTrailingWhitespace(string* s) {
    246   string::size_type i;
    247   for (i = s->size(); i > 0 && isspace((*s)[i - 1]); --i) {
    248   }
    249   s->resize(i);
    250 }
    251 
    252 // Return lower-cased version of s.
    253 string Lowercase(StringPiece s) {
    254   string result(s.data(), s.size());
    255   for (char& c : result) {
    256     c = tolower(c);
    257   }
    258   return result;
    259 }
    260 
    261 // Return upper-cased version of s.
    262 string Uppercase(StringPiece s) {
    263   string result(s.data(), s.size());
    264   for (char& c : result) {
    265     c = toupper(c);
    266   }
    267   return result;
    268 }
    269 
    270 string ArgDefCase(StringPiece s) {
    271   const size_t n = s.size();
    272 
    273   // Compute the size of resulting string.
    274   // Number of extra underscores we will need to add.
    275   size_t extra_us = 0;
    276   // Number of non-alpha chars in the beginning to skip.
    277   size_t to_skip = 0;
    278   for (size_t i = 0; i < n; ++i) {
    279     // If we are skipping and current letter is non-alpha, skip it as well
    280     if (i == to_skip && !isalpha(s[i])) {
    281       ++to_skip;
    282       continue;
    283     }
    284 
    285     // If we are here, we are not skipping any more.
    286     // If this letter is upper case, not the very first char in the
    287     // resulting string, and previous letter isn't replaced with an underscore,
    288     // we will need to insert an underscore.
    289     if (isupper(s[i]) && i != to_skip && i > 0 && isalnum(s[i - 1])) {
    290       ++extra_us;
    291     }
    292   }
    293 
    294   // Initialize result with all '_'s. There is no string
    295   // constructor that does not initialize memory.
    296   string result(n + extra_us - to_skip, '_');
    297   // i - index into s
    298   // j - index into result
    299   for (size_t i = to_skip, j = 0; i < n; ++i, ++j) {
    300     DCHECK_LT(j, result.size());
    301     char c = s[i];
    302     // If c is not alphanumeric, we don't need to do anything
    303     // since there is already an underscore in its place.
    304     if (isalnum(c)) {
    305       if (isupper(c)) {
    306         // If current char is upper case, we might need to insert an
    307         // underscore.
    308         if (i != to_skip) {
    309           DCHECK_GT(j, 0);
    310           if (result[j - 1] != '_') ++j;
    311         }
    312         result[j] = tolower(c);
    313       } else {
    314         result[j] = c;
    315       }
    316     }
    317   }
    318 
    319   return result;
    320 }
    321 
    322 void TitlecaseString(string* s, StringPiece delimiters) {
    323   bool upper = true;
    324   for (string::iterator ss = s->begin(); ss != s->end(); ++ss) {
    325     if (upper) {
    326       *ss = toupper(*ss);
    327     }
    328     upper = (delimiters.find(*ss) != StringPiece::npos);
    329   }
    330 }
    331 
    332 string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
    333                      bool replace_all) {
    334   // TODO(jlebar): We could avoid having to shift data around in the string if
    335   // we had a StringPiece::find() overload that searched for a StringPiece.
    336   string res(s);
    337   size_t pos = 0;
    338   while ((pos = res.find(oldsub.data(), pos, oldsub.size())) != string::npos) {
    339     res.replace(pos, oldsub.size(), newsub.data(), newsub.size());
    340     pos += newsub.size();
    341     if (oldsub.empty()) {
    342       pos++;  // Match at the beginning of the text and after every byte
    343     }
    344     if (!replace_all) {
    345       break;
    346     }
    347   }
    348   return res;
    349 }
    350 
    351 size_t RemoveLeadingWhitespace(StringPiece* text) {
    352   size_t count = 0;
    353   const char* ptr = text->data();
    354   while (count < text->size() && isspace(*ptr)) {
    355     count++;
    356     ptr++;
    357   }
    358   text->remove_prefix(count);
    359   return count;
    360 }
    361 
    362 size_t RemoveTrailingWhitespace(StringPiece* text) {
    363   size_t count = 0;
    364   const char* ptr = text->data() + text->size() - 1;
    365   while (count < text->size() && isspace(*ptr)) {
    366     ++count;
    367     --ptr;
    368   }
    369   text->remove_suffix(count);
    370   return count;
    371 }
    372 
    373 size_t RemoveWhitespaceContext(StringPiece* text) {
    374   // use RemoveLeadingWhitespace() and RemoveTrailingWhitespace() to do the job
    375   return (RemoveLeadingWhitespace(text) + RemoveTrailingWhitespace(text));
    376 }
    377 
    378 bool ConsumePrefix(StringPiece* s, StringPiece expected) {
    379   if (StartsWith(*s, expected)) {
    380     s->remove_prefix(expected.size());
    381     return true;
    382   }
    383   return false;
    384 }
    385 
    386 bool ConsumeSuffix(StringPiece* s, StringPiece expected) {
    387   if (EndsWith(*s, expected)) {
    388     s->remove_suffix(expected.size());
    389     return true;
    390   }
    391   return false;
    392 }
    393 
    394 bool ConsumeLeadingDigits(StringPiece* s, uint64* val) {
    395   const char* p = s->data();
    396   const char* limit = p + s->size();
    397   uint64 v = 0;
    398   while (p < limit) {
    399     const char c = *p;
    400     if (c < '0' || c > '9') break;
    401     uint64 new_v = (v * 10) + (c - '0');
    402     if (new_v / 8 < v) {
    403       // Overflow occurred
    404       return false;
    405     }
    406     v = new_v;
    407     p++;
    408   }
    409   if (p > s->data()) {
    410     // Consume some digits
    411     s->remove_prefix(p - s->data());
    412     *val = v;
    413     return true;
    414   } else {
    415     return false;
    416   }
    417 }
    418 
    419 bool ConsumeNonWhitespace(StringPiece* s, StringPiece* val) {
    420   const char* p = s->data();
    421   const char* limit = p + s->size();
    422   while (p < limit) {
    423     const char c = *p;
    424     if (isspace(c)) break;
    425     p++;
    426   }
    427   const size_t n = p - s->data();
    428   if (n > 0) {
    429     *val = StringPiece(s->data(), n);
    430     s->remove_prefix(n);
    431     return true;
    432   } else {
    433     *val = StringPiece();
    434     return false;
    435   }
    436 }
    437 
    438 bool SplitAndParseAsInts(StringPiece text, char delim,
    439                          std::vector<int32>* result) {
    440   return SplitAndParseAsInts<int32>(text, delim, strings::safe_strto32, result);
    441 }
    442 
    443 bool SplitAndParseAsInts(StringPiece text, char delim,
    444                          std::vector<int64>* result) {
    445   return SplitAndParseAsInts<int64>(text, delim, strings::safe_strto64, result);
    446 }
    447 
    448 bool SplitAndParseAsFloats(StringPiece text, char delim,
    449                            std::vector<float>* result) {
    450   return SplitAndParseAsInts<float>(text, delim,
    451                                     [](StringPiece str, float* value) {
    452                                       return strings::safe_strtof(str, value);
    453                                     },
    454                                     result);
    455 }
    456 
    457 size_t Strnlen(const char* str, const size_t string_max_len) {
    458   size_t len = 0;
    459   while (len < string_max_len && str[len] != '\0') {
    460     ++len;
    461   }
    462   return len;
    463 }
    464 
    465 bool StrContains(StringPiece haystack, StringPiece needle) {
    466   return std::search(haystack.begin(), haystack.end(), needle.begin(),
    467                      needle.end()) != haystack.end();
    468 }
    469 
    470 bool StartsWith(StringPiece text, StringPiece prefix) {
    471   return prefix.empty() ||
    472          (text.size() >= prefix.size() &&
    473           memcmp(text.data(), prefix.data(), prefix.size()) == 0);
    474 }
    475 
    476 bool EndsWith(StringPiece text, StringPiece suffix) {
    477   return suffix.empty() || (text.size() >= suffix.size() &&
    478                             memcmp(text.data() + (text.size() - suffix.size()),
    479                                    suffix.data(), suffix.size()) == 0);
    480 }
    481 
    482 }  // namespace str_util
    483 }  // namespace tensorflow
    484