Home | History | Annotate | Download | only in strings
      1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #ifndef TENSORFLOW_LIB_STRINGS_SCANNER_H_
     17 #define TENSORFLOW_LIB_STRINGS_SCANNER_H_
     18 
     19 #include <string>
     20 #include "tensorflow/core/lib/core/stringpiece.h"
     21 #include "tensorflow/core/platform/macros.h"
     22 
     23 namespace tensorflow {
     24 namespace strings {
     25 
     26 // Scanner provides simplified string parsing, in which a string is parsed as a
     27 // series of scanning calls (e.g. One, Any, Many, OneLiteral, Eos), and then
     28 // finally GetResult is called. If GetResult returns true, then it also returns
     29 // the remaining characters and any captured substring.
     30 //
     31 // The range to capture can be controlled with RestartCapture and StopCapture;
     32 // by default, all processed characters are captured.
     33 class Scanner {
     34  public:
     35   // Classes of characters. Each enum name is to be read as the union of the
     36   // parts - e.g., class LETTER_DIGIT means the class includes all letters and
     37   // all digits.
     38   //
     39   // LETTER means ascii letter a-zA-Z.
     40   // DIGIT means ascii digit: 0-9.
     41   enum CharClass {
     42     // NOTE: When adding a new CharClass, update the AllCharClasses ScannerTest
     43     // in scanner_test.cc
     44     ALL,
     45     DIGIT,
     46     LETTER,
     47     LETTER_DIGIT,
     48     LETTER_DIGIT_DASH_UNDERSCORE,
     49     LETTER_DIGIT_DASH_DOT_SLASH,             // SLASH is / only, not backslash
     50     LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE,  // SLASH is / only, not backslash
     51     LETTER_DIGIT_DOT,
     52     LETTER_DIGIT_DOT_PLUS_MINUS,
     53     LETTER_DIGIT_DOT_UNDERSCORE,
     54     LETTER_DIGIT_UNDERSCORE,
     55     LOWERLETTER,
     56     LOWERLETTER_DIGIT,
     57     LOWERLETTER_DIGIT_UNDERSCORE,
     58     NON_ZERO_DIGIT,
     59     SPACE,
     60     UPPERLETTER,
     61   };
     62 
     63   explicit Scanner(StringPiece source) : cur_(source) { RestartCapture(); }
     64 
     65   // Consume the next character of the given class from input. If the next
     66   // character is not in the class, then GetResult will ultimately return false.
     67   Scanner& One(CharClass clz) {
     68     if (cur_.empty() || !Matches(clz, cur_[0])) {
     69       return Error();
     70     }
     71     cur_.remove_prefix(1);
     72     return *this;
     73   }
     74 
     75   // Consume the next s.size() characters of the input, if they match <s>. If
     76   // they don't match <s>, this is a no-op.
     77   Scanner& ZeroOrOneLiteral(StringPiece s) {
     78     cur_.Consume(s);
     79     return *this;
     80   }
     81 
     82   // Consume the next s.size() characters of the input, if they match <s>. If
     83   // they don't match <s>, then GetResult will ultimately return false.
     84   Scanner& OneLiteral(StringPiece s) {
     85     if (!cur_.Consume(s)) {
     86       error_ = true;
     87     }
     88     return *this;
     89   }
     90 
     91   // Consume characters from the input as long as they match <clz>. Zero
     92   // characters is still considered a match, so it will never cause GetResult to
     93   // return false.
     94   Scanner& Any(CharClass clz) {
     95     while (!cur_.empty() && Matches(clz, cur_[0])) {
     96       cur_.remove_prefix(1);
     97     }
     98     return *this;
     99   }
    100 
    101   // Shorthand for One(clz).Any(clz).
    102   Scanner& Many(CharClass clz) { return One(clz).Any(clz); }
    103 
    104   // Reset the capture start point.
    105   //
    106   // Later, when GetResult is called and if it returns true, the capture
    107   // returned will start at the position at the time this was called.
    108   Scanner& RestartCapture() {
    109     capture_start_ = cur_.data();
    110     capture_end_ = nullptr;
    111     return *this;
    112   }
    113 
    114   // Stop capturing input.
    115   //
    116   // Later, when GetResult is called and if it returns true, the capture
    117   // returned will end at the position at the time this was called.
    118   Scanner& StopCapture() {
    119     capture_end_ = cur_.data();
    120     return *this;
    121   }
    122 
    123   // If not at the input of input, then GetResult will ultimately return false.
    124   Scanner& Eos() {
    125     if (!cur_.empty()) error_ = true;
    126     return *this;
    127   }
    128 
    129   // Shorthand for Any(SPACE).
    130   Scanner& AnySpace() { return Any(SPACE); }
    131 
    132   // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
    133   Scanner& ScanUntil(char end_ch) {
    134     ScanUntilImpl(end_ch, false);
    135     return *this;
    136   }
    137 
    138   // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
    139   // Backslash escape sequences are skipped.
    140   // Used for implementing quoted string scanning.
    141   Scanner& ScanEscapedUntil(char end_ch) {
    142     ScanUntilImpl(end_ch, true);
    143     return *this;
    144   }
    145 
    146   // Return the next character that will be scanned, or <default_value> if there
    147   // are no more characters to scan.
    148   // Note that if a scan operation has failed (so GetResult() returns false),
    149   // then the value of Peek may or may not have advanced since the scan
    150   // operation that failed.
    151   char Peek(char default_value = '\0') const {
    152     return cur_.empty() ? default_value : cur_[0];
    153   }
    154 
    155   // Returns false if there are no remaining characters to consume.
    156   int empty() const { return cur_.empty(); }
    157 
    158   // Returns true if the input string successfully matched. When true is
    159   // returned, the remaining string is returned in <remaining> and the captured
    160   // string returned in <capture>, if non-NULL.
    161   bool GetResult(StringPiece* remaining = nullptr,
    162                  StringPiece* capture = nullptr);
    163 
    164  private:
    165   void ScanUntilImpl(char end_ch, bool escaped);
    166 
    167   Scanner& Error() {
    168     error_ = true;
    169     return *this;
    170   }
    171 
    172   static bool IsLetter(char ch) {
    173     return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
    174   }
    175 
    176   static bool IsLowerLetter(char ch) { return ch >= 'a' && ch <= 'z'; }
    177 
    178   static bool IsDigit(char ch) { return ch >= '0' && ch <= '9'; }
    179 
    180   static bool IsSpace(char ch) {
    181     return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\v' || ch == '\f' ||
    182             ch == '\r');
    183   }
    184 
    185   static bool Matches(CharClass clz, char ch) {
    186     switch (clz) {
    187       case ALL:
    188         return true;
    189       case DIGIT:
    190         return IsDigit(ch);
    191       case LETTER:
    192         return IsLetter(ch);
    193       case LETTER_DIGIT:
    194         return IsLetter(ch) || IsDigit(ch);
    195       case LETTER_DIGIT_DASH_UNDERSCORE:
    196         return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '_');
    197       case LETTER_DIGIT_DASH_DOT_SLASH:
    198         return IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' ||
    199                ch == '/';
    200       case LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE:
    201         return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' ||
    202                 ch == '/' || ch == '_');
    203       case LETTER_DIGIT_DOT:
    204         return IsLetter(ch) || IsDigit(ch) || ch == '.';
    205       case LETTER_DIGIT_DOT_PLUS_MINUS:
    206         return IsLetter(ch) || IsDigit(ch) || ch == '+' || ch == '-' ||
    207                ch == '.';
    208       case LETTER_DIGIT_DOT_UNDERSCORE:
    209         return IsLetter(ch) || IsDigit(ch) || ch == '.' || ch == '_';
    210       case LETTER_DIGIT_UNDERSCORE:
    211         return IsLetter(ch) || IsDigit(ch) || ch == '_';
    212       case LOWERLETTER:
    213         return ch >= 'a' && ch <= 'z';
    214       case LOWERLETTER_DIGIT:
    215         return IsLowerLetter(ch) || IsDigit(ch);
    216       case LOWERLETTER_DIGIT_UNDERSCORE:
    217         return IsLowerLetter(ch) || IsDigit(ch) || ch == '_';
    218       case NON_ZERO_DIGIT:
    219         return IsDigit(ch) && ch != '0';
    220       case SPACE:
    221         return IsSpace(ch);
    222       case UPPERLETTER:
    223         return ch >= 'A' && ch <= 'Z';
    224     }
    225     return false;
    226   }
    227 
    228   StringPiece cur_;
    229   const char* capture_start_ = nullptr;
    230   const char* capture_end_ = nullptr;
    231   bool error_ = false;
    232 
    233   friend class ScannerTest;
    234   TF_DISALLOW_COPY_AND_ASSIGN(Scanner);
    235 };
    236 
    237 }  // namespace strings
    238 }  // namespace tensorflow
    239 
    240 #endif  // TENSORFLOW_LIB_STRINGS_SCANNER_H_
    241