Home | History | Annotate | Download | only in strings
      1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #ifndef TENSORFLOW_LIB_STRINGS_SCANNER_H_
     17 #define TENSORFLOW_LIB_STRINGS_SCANNER_H_
     18 
     19 #include <string>
     20 #include "tensorflow/core/lib/core/stringpiece.h"
     21 #include "tensorflow/core/lib/strings/str_util.h"
     22 #include "tensorflow/core/platform/macros.h"
     23 
     24 namespace tensorflow {
     25 namespace strings {
     26 
     27 // Scanner provides simplified string parsing, in which a string is parsed as a
     28 // series of scanning calls (e.g. One, Any, Many, OneLiteral, Eos), and then
     29 // finally GetResult is called. If GetResult returns true, then it also returns
     30 // the remaining characters and any captured substring.
     31 //
     32 // The range to capture can be controlled with RestartCapture and StopCapture;
     33 // by default, all processed characters are captured.
     34 class Scanner {
     35  public:
     36   // Classes of characters. Each enum name is to be read as the union of the
     37   // parts - e.g., class LETTER_DIGIT means the class includes all letters and
     38   // all digits.
     39   //
     40   // LETTER means ascii letter a-zA-Z.
     41   // DIGIT means ascii digit: 0-9.
     42   enum CharClass {
     43     // NOTE: When adding a new CharClass, update the AllCharClasses ScannerTest
     44     // in scanner_test.cc
     45     ALL,
     46     DIGIT,
     47     LETTER,
     48     LETTER_DIGIT,
     49     LETTER_DIGIT_DASH_UNDERSCORE,
     50     LETTER_DIGIT_DASH_DOT_SLASH,             // SLASH is / only, not backslash
     51     LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE,  // SLASH is / only, not backslash
     52     LETTER_DIGIT_DOT,
     53     LETTER_DIGIT_DOT_PLUS_MINUS,
     54     LETTER_DIGIT_DOT_UNDERSCORE,
     55     LETTER_DIGIT_UNDERSCORE,
     56     LOWERLETTER,
     57     LOWERLETTER_DIGIT,
     58     LOWERLETTER_DIGIT_UNDERSCORE,
     59     NON_ZERO_DIGIT,
     60     SPACE,
     61     UPPERLETTER,
     62   };
     63 
     64   explicit Scanner(StringPiece source) : cur_(source) { RestartCapture(); }
     65 
     66   // Consume the next character of the given class from input. If the next
     67   // character is not in the class, then GetResult will ultimately return false.
     68   Scanner& One(CharClass clz) {
     69     if (cur_.empty() || !Matches(clz, cur_[0])) {
     70       return Error();
     71     }
     72     cur_.remove_prefix(1);
     73     return *this;
     74   }
     75 
     76   // Consume the next s.size() characters of the input, if they match <s>. If
     77   // they don't match <s>, this is a no-op.
     78   Scanner& ZeroOrOneLiteral(StringPiece s) {
     79     str_util::ConsumePrefix(&cur_, s);
     80     return *this;
     81   }
     82 
     83   // Consume the next s.size() characters of the input, if they match <s>. If
     84   // they don't match <s>, then GetResult will ultimately return false.
     85   Scanner& OneLiteral(StringPiece s) {
     86     if (!str_util::ConsumePrefix(&cur_, s)) {
     87       error_ = true;
     88     }
     89     return *this;
     90   }
     91 
     92   // Consume characters from the input as long as they match <clz>. Zero
     93   // characters is still considered a match, so it will never cause GetResult to
     94   // return false.
     95   Scanner& Any(CharClass clz) {
     96     while (!cur_.empty() && Matches(clz, cur_[0])) {
     97       cur_.remove_prefix(1);
     98     }
     99     return *this;
    100   }
    101 
    102   // Shorthand for One(clz).Any(clz).
    103   Scanner& Many(CharClass clz) { return One(clz).Any(clz); }
    104 
    105   // Reset the capture start point.
    106   //
    107   // Later, when GetResult is called and if it returns true, the capture
    108   // returned will start at the position at the time this was called.
    109   Scanner& RestartCapture() {
    110     capture_start_ = cur_.data();
    111     capture_end_ = nullptr;
    112     return *this;
    113   }
    114 
    115   // Stop capturing input.
    116   //
    117   // Later, when GetResult is called and if it returns true, the capture
    118   // returned will end at the position at the time this was called.
    119   Scanner& StopCapture() {
    120     capture_end_ = cur_.data();
    121     return *this;
    122   }
    123 
    124   // If not at the input of input, then GetResult will ultimately return false.
    125   Scanner& Eos() {
    126     if (!cur_.empty()) error_ = true;
    127     return *this;
    128   }
    129 
    130   // Shorthand for Any(SPACE).
    131   Scanner& AnySpace() { return Any(SPACE); }
    132 
    133   // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
    134   Scanner& ScanUntil(char end_ch) {
    135     ScanUntilImpl(end_ch, false);
    136     return *this;
    137   }
    138 
    139   // This scans input until <end_ch> is reached. <end_ch> is NOT consumed.
    140   // Backslash escape sequences are skipped.
    141   // Used for implementing quoted string scanning.
    142   Scanner& ScanEscapedUntil(char end_ch) {
    143     ScanUntilImpl(end_ch, true);
    144     return *this;
    145   }
    146 
    147   // Return the next character that will be scanned, or <default_value> if there
    148   // are no more characters to scan.
    149   // Note that if a scan operation has failed (so GetResult() returns false),
    150   // then the value of Peek may or may not have advanced since the scan
    151   // operation that failed.
    152   char Peek(char default_value = '\0') const {
    153     return cur_.empty() ? default_value : cur_[0];
    154   }
    155 
    156   // Returns false if there are no remaining characters to consume.
    157   int empty() const { return cur_.empty(); }
    158 
    159   // Returns true if the input string successfully matched. When true is
    160   // returned, the remaining string is returned in <remaining> and the captured
    161   // string returned in <capture>, if non-NULL.
    162   bool GetResult(StringPiece* remaining = nullptr,
    163                  StringPiece* capture = nullptr);
    164 
    165  private:
    166   void ScanUntilImpl(char end_ch, bool escaped);
    167 
    168   Scanner& Error() {
    169     error_ = true;
    170     return *this;
    171   }
    172 
    173   static bool IsLetter(char ch) {
    174     return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
    175   }
    176 
    177   static bool IsLowerLetter(char ch) { return ch >= 'a' && ch <= 'z'; }
    178 
    179   static bool IsDigit(char ch) { return ch >= '0' && ch <= '9'; }
    180 
    181   static bool IsSpace(char ch) {
    182     return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\v' || ch == '\f' ||
    183             ch == '\r');
    184   }
    185 
    186   static bool Matches(CharClass clz, char ch) {
    187     switch (clz) {
    188       case ALL:
    189         return true;
    190       case DIGIT:
    191         return IsDigit(ch);
    192       case LETTER:
    193         return IsLetter(ch);
    194       case LETTER_DIGIT:
    195         return IsLetter(ch) || IsDigit(ch);
    196       case LETTER_DIGIT_DASH_UNDERSCORE:
    197         return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '_');
    198       case LETTER_DIGIT_DASH_DOT_SLASH:
    199         return IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' ||
    200                ch == '/';
    201       case LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE:
    202         return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' ||
    203                 ch == '/' || ch == '_');
    204       case LETTER_DIGIT_DOT:
    205         return IsLetter(ch) || IsDigit(ch) || ch == '.';
    206       case LETTER_DIGIT_DOT_PLUS_MINUS:
    207         return IsLetter(ch) || IsDigit(ch) || ch == '+' || ch == '-' ||
    208                ch == '.';
    209       case LETTER_DIGIT_DOT_UNDERSCORE:
    210         return IsLetter(ch) || IsDigit(ch) || ch == '.' || ch == '_';
    211       case LETTER_DIGIT_UNDERSCORE:
    212         return IsLetter(ch) || IsDigit(ch) || ch == '_';
    213       case LOWERLETTER:
    214         return ch >= 'a' && ch <= 'z';
    215       case LOWERLETTER_DIGIT:
    216         return IsLowerLetter(ch) || IsDigit(ch);
    217       case LOWERLETTER_DIGIT_UNDERSCORE:
    218         return IsLowerLetter(ch) || IsDigit(ch) || ch == '_';
    219       case NON_ZERO_DIGIT:
    220         return IsDigit(ch) && ch != '0';
    221       case SPACE:
    222         return IsSpace(ch);
    223       case UPPERLETTER:
    224         return ch >= 'A' && ch <= 'Z';
    225     }
    226     return false;
    227   }
    228 
    229   StringPiece cur_;
    230   const char* capture_start_ = nullptr;
    231   const char* capture_end_ = nullptr;
    232   bool error_ = false;
    233 
    234   friend class ScannerTest;
    235   TF_DISALLOW_COPY_AND_ASSIGN(Scanner);
    236 };
    237 
    238 }  // namespace strings
    239 }  // namespace tensorflow
    240 
    241 #endif  // TENSORFLOW_LIB_STRINGS_SCANNER_H_
    242