Home | History | Annotate | Download | only in strings
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #ifndef TENSORFLOW_LIB_STRINGS_STR_UTIL_H_
     17 #define TENSORFLOW_LIB_STRINGS_STR_UTIL_H_
     18 
     19 #include <functional>
     20 #include <string>
     21 #include <vector>
     22 #include "tensorflow/core/lib/core/stringpiece.h"
     23 #include "tensorflow/core/lib/gtl/array_slice.h"
     24 #include "tensorflow/core/lib/strings/strcat.h"
     25 #include "tensorflow/core/platform/types.h"
     26 
     27 // Basic string utility routines
     28 namespace tensorflow {
     29 namespace str_util {
     30 
// Returns a version of 'src' where unprintable characters have been
// escaped using C-style escape sequences.
string CEscape(StringPiece src);

// Copies "source" to "dest", rewriting C-style escape sequences --
// '\n', '\r', '\\', '\ooo', etc -- to their ASCII equivalents.
//
// Errors: Sets the description of the first encountered error in
// 'error'. To disable error reporting, set 'error' to NULL.
//
// NOTE: Does not support \u or \U!
//
// NOTE(review): the meaning of the bool result is not stated here;
// presumably true means the whole input was unescaped without error --
// confirm against the implementation.
bool CUnescape(StringPiece source, string* dest, string* error);
     43 
// Removes any trailing whitespace from "*s".
void StripTrailingWhitespace(string* s);

// Removes leading ascii_isspace() characters from "*text".
// Returns number of characters removed.
size_t RemoveLeadingWhitespace(StringPiece* text);

// Removes trailing ascii_isspace() characters from "*text".
// Returns number of characters removed.
size_t RemoveTrailingWhitespace(StringPiece* text);

// Removes leading and trailing ascii_isspace() characters from "*text".
// Returns number of characters removed.
// NOTE(review): presumably the count covers both ends combined -- confirm
// against the implementation.
size_t RemoveWhitespaceContext(StringPiece* text);
     58 
// Consume a leading positive integer value.  If any digits were
// found and the value did not overflow, store the value of the leading
// unsigned number in "*val", advance "*s" past the consumed number, and
// return true.  Returns false if no digits were found or if overflow
// occurred.
bool ConsumeLeadingDigits(StringPiece* s, uint64* val);

// Consume a leading token composed of non-whitespace characters only.
// If *s starts with a non-zero number of non-whitespace characters, store
// them in *val, advance *s past them, and return true.  Else return false.
bool ConsumeNonWhitespace(StringPiece* s, StringPiece* val);

// If "*s" starts with "expected", consume it (advance "*s" past it) and
// return true.  Otherwise, return false.
bool ConsumePrefix(StringPiece* s, StringPiece expected);

// If "*s" ends with "expected", remove it from the end of "*s" and return
// true.  Otherwise, return false.
bool ConsumeSuffix(StringPiece* s, StringPiece expected);
     77 
// Returns a lower-cased version of s.
string Lowercase(StringPiece s);

// Returns an upper-cased version of s.
string Uppercase(StringPiece s);

// Converts "^2ILoveYou!" to "i_love_you_". More specifically:
// - converts all non-alphanumeric characters to underscores
// - replaces each occurrence of a capital letter with '_' followed by
//   this letter in lower case, except when it is the very first character
//   or when there is already an '_' before it (then only the lower-cased
//   letter is emitted, as in the example above)
// - skips leading non-alpha characters
// This function is useful for producing strings matching "[a-z][a-z0-9_]*"
// as required by OpDef.ArgDef.name. The resulting string is either empty or
// matches this regex.
string ArgDefCase(StringPiece s);

// Capitalizes the first character of each word in "*s".  "delimiters" is a
// set of characters that can be used as word boundaries.
void TitlecaseString(string* s, StringPiece delimiters);
     98 
// Replaces the first occurrence (if replace_all is false) or all occurrences
// (if replace_all is true) of oldsub in s with newsub, and returns the
// result as a new string.
string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
                     bool replace_all);
    103 
// Join functionality
//
// Returns the elements of "s" concatenated in iteration order, with "sep"
// inserted between consecutive elements (not before the first or after the
// last).  Each element is appended via strings::StrAppend.  An empty "s"
// yields an empty string.
template <typename T>
string Join(const T& s, const char* sep);

// A variant of Join where for each element of "s", f(&dest_string, elem)
// is invoked.  f is often constructed with a lambda of the form:
//   [](string* result, ElemType elem)
// "sep" is appended to the result between consecutive elements, before f
// is invoked for the later one.
template <typename T, typename Formatter>
string Join(const T& s, const char* sep, Formatter f);
    113 
    114 struct AllowEmpty {
    115   bool operator()(StringPiece sp) const { return true; }
    116 };
    117 struct SkipEmpty {
    118   bool operator()(StringPiece sp) const { return !sp.empty(); }
    119 };
    120 struct SkipWhitespace {
    121   bool operator()(StringPiece sp) const {
    122     RemoveTrailingWhitespace(&sp);
    123     return !sp.empty();
    124   }
    125 };
    126 
// Split strings using any of the supplied delimiters. For example:
// Split("a,b.c,d", ".,") would return {"a", "b", "c", "d"}.
// Empty tokens (e.g. between two adjacent delimiters) are kept.  An empty
// "text" yields an empty vector.
std::vector<string> Split(StringPiece text, StringPiece delims);

// As above, but a token is only added to the result if p(token) returns
// true.  AllowEmpty, SkipEmpty and SkipWhitespace are ready-made predicates.
template <typename Predicate>
std::vector<string> Split(StringPiece text, StringPiece delims, Predicate p);
    133 
// Split "text" at "delim" characters, and parse each component as
// an integer.  If successful, adds the individual numbers in order
// to "*result" and returns true.  Otherwise returns false.
bool SplitAndParseAsInts(StringPiece text, char delim,
                         std::vector<int32>* result);
bool SplitAndParseAsInts(StringPiece text, char delim,
                         std::vector<int64>* result);
// NOTE(review): presumably the same contract as SplitAndParseAsInts, with
// each component parsed as a float instead of an integer -- confirm against
// the implementation.
bool SplitAndParseAsFloats(StringPiece text, char delim,
                           std::vector<float>* result);
    143 
    144 // ------------------------------------------------------------------
    145 // Implementation details below
    146 template <typename T>
    147 string Join(const T& s, const char* sep) {
    148   string result;
    149   bool first = true;
    150   for (const auto& x : s) {
    151     tensorflow::strings::StrAppend(&result, (first ? "" : sep), x);
    152     first = false;
    153   }
    154   return result;
    155 }
    156 
    157 template <typename T>
    158 class Formatter {
    159  public:
    160   Formatter(std::function<void(string*, T)> f) : f_(f) {}
    161   void operator()(string* out, const T& t) { f_(out, t); }
    162 
    163  private:
    164   std::function<void(string*, T)> f_;
    165 };
    166 
    167 template <typename T, typename Formatter>
    168 string Join(const T& s, const char* sep, Formatter f) {
    169   string result;
    170   bool first = true;
    171   for (const auto& x : s) {
    172     if (!first) {
    173       result.append(sep);
    174     }
    175     f(&result, x);
    176     first = false;
    177   }
    178   return result;
    179 }
    180 
    181 inline std::vector<string> Split(StringPiece text, StringPiece delims) {
    182   return Split(text, delims, AllowEmpty());
    183 }
    184 
    185 template <typename Predicate>
    186 std::vector<string> Split(StringPiece text, StringPiece delims, Predicate p) {
    187   std::vector<string> result;
    188   size_t token_start = 0;
    189   if (!text.empty()) {
    190     for (size_t i = 0; i < text.size() + 1; i++) {
    191       if ((i == text.size()) || (delims.find(text[i]) != StringPiece::npos)) {
    192         StringPiece token(text.data() + token_start, i - token_start);
    193         if (p(token)) {
    194           result.push_back(token.ToString());
    195         }
    196         token_start = i + 1;
    197       }
    198     }
    199   }
    200   return result;
    201 }
    202 
    203 inline std::vector<string> Split(StringPiece text, char delim) {
    204   return Split(text, StringPiece(&delim, 1));
    205 }
    206 
    207 template <typename Predicate>
    208 std::vector<string> Split(StringPiece text, char delims, Predicate p) {
    209   return Split(text, StringPiece(&delims, 1), p);
    210 }
    211 
    212 }  // namespace str_util
    213 }  // namespace tensorflow
    214 
    215 #endif  // TENSORFLOW_LIB_STRINGS_STR_UTIL_H_
    216