Home | History | Annotate | Download | only in gn
      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "tools/gn/tokenizer.h"
      6 
      7 #include "base/logging.h"
      8 #include "base/strings/string_util.h"
      9 #include "tools/gn/input_file.h"
     10 
     11 namespace {
     12 
     13 bool CouldBeTwoCharOperatorBegin(char c) {
     14   return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
     15          c == '+' || c == '|' || c == '&';
     16 }
     17 
     18 bool CouldBeTwoCharOperatorEnd(char c) {
     19   return c == '=' || c == '|' || c == '&';
     20 }
     21 
     22 bool CouldBeOneCharOperator(char c) {
     23   return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
     24          c == ':' || c == '|' || c == '&' || c == '-';
     25 }
     26 
     27 bool CouldBeOperator(char c) {
     28   return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
     29 }
     30 
     31 bool IsScoperChar(char c) {
     32   return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
     33 }
     34 
     35 Token::Type GetSpecificOperatorType(base::StringPiece value) {
     36   if (value == "=")
     37     return Token::EQUAL;
     38   if (value == "+")
     39     return Token::PLUS;
     40   if (value == "-")
     41     return Token::MINUS;
     42   if (value == "+=")
     43     return Token::PLUS_EQUALS;
     44   if (value == "-=")
     45     return Token::MINUS_EQUALS;
     46   if (value == "==")
     47     return Token::EQUAL_EQUAL;
     48   if (value == "!=")
     49     return Token::NOT_EQUAL;
     50   if (value == "<=")
     51     return Token::LESS_EQUAL;
     52   if (value == ">=")
     53     return Token::GREATER_EQUAL;
     54   if (value == "<")
     55     return Token::LESS_THAN;
     56   if (value == ">")
     57     return Token::GREATER_THAN;
     58   if (value == "&&")
     59     return Token::BOOLEAN_AND;
     60   if (value == "||")
     61     return Token::BOOLEAN_OR;
     62   if (value == "!")
     63     return Token::BANG;
     64   if (value == ".")
     65     return Token::DOT;
     66   return Token::INVALID;
     67 }
     68 
     69 }  // namespace
     70 
     71 Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
     72     : input_file_(input_file),
     73       input_(input_file->contents()),
     74       err_(err),
     75       cur_(0),
     76       line_number_(1),
     77       char_in_line_(1) {
     78 }
     79 
     80 Tokenizer::~Tokenizer() {
     81 }
     82 
     83 // static
     84 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
     85   Tokenizer t(input_file, err);
     86   return t.Run();
     87 }
     88 
     89 std::vector<Token> Tokenizer::Run() {
     90   DCHECK(tokens_.empty());
     91   while (!done()) {
     92     AdvanceToNextToken();
     93     if (done())
     94       break;
     95     Location location = GetCurrentLocation();
     96 
     97     Token::Type type = ClassifyCurrent();
     98     if (type == Token::INVALID) {
     99       *err_ = GetErrorForInvalidToken(location);
    100       break;
    101     }
    102     size_t token_begin = cur_;
    103     AdvanceToEndOfToken(location, type);
    104     if (has_error())
    105       break;
    106     size_t token_end = cur_;
    107 
    108     base::StringPiece token_value(&input_.data()[token_begin],
    109                                   token_end - token_begin);
    110 
    111     if (type == Token::UNCLASSIFIED_OPERATOR) {
    112       type = GetSpecificOperatorType(token_value);
    113     } else if (type == Token::IDENTIFIER) {
    114       if (token_value == "if")
    115         type = Token::IF;
    116       else if (token_value == "else")
    117         type = Token::ELSE;
    118       else if (token_value == "true")
    119         type = Token::TRUE_TOKEN;
    120       else if (token_value == "false")
    121         type = Token::FALSE_TOKEN;
    122     } else if (type == Token::UNCLASSIFIED_COMMENT) {
    123       if (AtStartOfLine(token_begin) &&
    124           // If it's a standalone comment, but is a continuation of a comment on
    125           // a previous line, then instead make it a continued suffix comment.
    126           (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT ||
    127            tokens_.back().location().line_number() + 1 !=
    128                location.line_number() ||
    129            tokens_.back().location().char_offset() != location.char_offset())) {
    130         type = Token::LINE_COMMENT;
    131         Advance();  // The current \n.
    132         // If this comment is separated from the next syntax element, then we
    133         // want to tag it as a block comment. This will become a standalone
    134         // statement at the parser level to keep this comment separate, rather
    135         // than attached to the subsequent statement.
    136         while (!at_end() && IsCurrentWhitespace()) {
    137           if (IsCurrentNewline()) {
    138             type = Token::BLOCK_COMMENT;
    139             break;
    140           }
    141           Advance();
    142         }
    143       } else {
    144         type = Token::SUFFIX_COMMENT;
    145       }
    146     }
    147 
    148     tokens_.push_back(Token(location, type, token_value));
    149   }
    150   if (err_->has_error())
    151     tokens_.clear();
    152   return tokens_;
    153 }
    154 
    155 // static
    156 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
    157   DCHECK_GT(n, 0);
    158 
    159   if (n == 1)
    160     return 0;
    161 
    162   int cur_line = 1;
    163   size_t cur_byte = 0;
    164   while (cur_byte < buf.size()) {
    165     if (IsNewline(buf, cur_byte)) {
    166       cur_line++;
    167       if (cur_line == n)
    168         return cur_byte + 1;
    169     }
    170     cur_byte++;
    171   }
    172   return static_cast<size_t>(-1);
    173 }
    174 
    175 // static
    176 bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
    177   DCHECK(offset < buffer.size());
    178   // We may need more logic here to handle different line ending styles.
    179   return buffer[offset] == '\n';
    180 }
    181 
    182 
    183 void Tokenizer::AdvanceToNextToken() {
    184   while (!at_end() && IsCurrentWhitespace())
    185     Advance();
    186 }
    187 
    188 Token::Type Tokenizer::ClassifyCurrent() const {
    189   DCHECK(!at_end());
    190   char next_char = cur_char();
    191   if (IsAsciiDigit(next_char))
    192     return Token::INTEGER;
    193   if (next_char == '"')
    194     return Token::STRING;
    195 
    196   // Note: '-' handled specially below.
    197   if (next_char != '-' && CouldBeOperator(next_char))
    198     return Token::UNCLASSIFIED_OPERATOR;
    199 
    200   if (IsIdentifierFirstChar(next_char))
    201     return Token::IDENTIFIER;
    202 
    203   if (next_char == '[')
    204     return Token::LEFT_BRACKET;
    205   if (next_char == ']')
    206     return Token::RIGHT_BRACKET;
    207   if (next_char == '(')
    208     return Token::LEFT_PAREN;
    209   if (next_char == ')')
    210     return Token::RIGHT_PAREN;
    211   if (next_char == '{')
    212     return Token::LEFT_BRACE;
    213   if (next_char == '}')
    214     return Token::RIGHT_BRACE;
    215 
    216   if (next_char == '.')
    217     return Token::DOT;
    218   if (next_char == ',')
    219     return Token::COMMA;
    220 
    221   if (next_char == '#')
    222     return Token::UNCLASSIFIED_COMMENT;
    223 
    224   // For the case of '-' differentiate between a negative number and anything
    225   // else.
    226   if (next_char == '-') {
    227     if (!CanIncrement())
    228       return Token::UNCLASSIFIED_OPERATOR;  // Just the minus before end of
    229                                             // file.
    230     char following_char = input_[cur_ + 1];
    231     if (IsAsciiDigit(following_char))
    232       return Token::INTEGER;
    233     return Token::UNCLASSIFIED_OPERATOR;
    234   }
    235 
    236   return Token::INVALID;
    237 }
    238 
    239 void Tokenizer::AdvanceToEndOfToken(const Location& location,
    240                                     Token::Type type) {
    241   switch (type) {
    242     case Token::INTEGER:
    243       do {
    244         Advance();
    245       } while (!at_end() && IsAsciiDigit(cur_char()));
    246       if (!at_end()) {
    247         // Require the char after a number to be some kind of space, scope,
    248         // or operator.
    249         char c = cur_char();
    250         if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
    251             !IsScoperChar(c) && c != ',') {
    252           *err_ = Err(GetCurrentLocation(),
    253                       "This is not a valid number.",
    254                       "Learn to count.");
    255           // Highlight the number.
    256           err_->AppendRange(LocationRange(location, GetCurrentLocation()));
    257         }
    258       }
    259       break;
    260 
    261     case Token::STRING: {
    262       char initial = cur_char();
    263       Advance();  // Advance past initial "
    264       for (;;) {
    265         if (at_end()) {
    266           *err_ = Err(LocationRange(location, GetCurrentLocation()),
    267                       "Unterminated string literal.",
    268                       "Don't leave me hanging like this!");
    269           break;
    270         }
    271         if (IsCurrentStringTerminator(initial)) {
    272           Advance();  // Skip past last "
    273           break;
    274         } else if (cur_char() == '\n') {
    275           *err_ = Err(LocationRange(location, GetCurrentLocation()),
    276                       "Newline in string constant.");
    277         }
    278         Advance();
    279       }
    280       break;
    281     }
    282 
    283     case Token::UNCLASSIFIED_OPERATOR:
    284       // Some operators are two characters, some are one.
    285       if (CouldBeTwoCharOperatorBegin(cur_char())) {
    286         if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
    287           Advance();
    288       }
    289       Advance();
    290       break;
    291 
    292     case Token::IDENTIFIER:
    293       while (!at_end() && IsIdentifierContinuingChar(cur_char()))
    294         Advance();
    295       break;
    296 
    297     case Token::LEFT_BRACKET:
    298     case Token::RIGHT_BRACKET:
    299     case Token::LEFT_BRACE:
    300     case Token::RIGHT_BRACE:
    301     case Token::LEFT_PAREN:
    302     case Token::RIGHT_PAREN:
    303     case Token::DOT:
    304     case Token::COMMA:
    305       Advance();  // All are one char.
    306       break;
    307 
    308     case Token::UNCLASSIFIED_COMMENT:
    309       // Eat to EOL.
    310       while (!at_end() && !IsCurrentNewline())
    311         Advance();
    312       break;
    313 
    314     case Token::INVALID:
    315     default:
    316       *err_ = Err(location, "Everything is all messed up",
    317                   "Please insert system disk in drive A: and press any key.");
    318       NOTREACHED();
    319       return;
    320   }
    321 }
    322 
    323 bool Tokenizer::AtStartOfLine(size_t location) const {
    324   while (location > 0) {
    325     --location;
    326     char c = input_[location];
    327     if (c == '\n')
    328       return true;
    329     if (c != ' ')
    330       return false;
    331   }
    332   return true;
    333 }
    334 
    335 bool Tokenizer::IsCurrentWhitespace() const {
    336   DCHECK(!at_end());
    337   char c = input_[cur_];
    338   // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
    339   return c == 0x0A || c == 0x0D || c == 0x20;
    340 }
    341 
    342 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
    343   DCHECK(!at_end());
    344   if (cur_char() != quote_char)
    345     return false;
    346 
    347   // Check for escaping. \" is not a string terminator, but \\" is. Count
    348   // the number of preceeding backslashes.
    349   int num_backslashes = 0;
    350   for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
    351     num_backslashes++;
    352 
    353   // Even backslashes mean that they were escaping each other and don't count
    354   // as escaping this quote.
    355   return (num_backslashes % 2) == 0;
    356 }
    357 
    358 bool Tokenizer::IsCurrentNewline() const {
    359   return IsNewline(input_, cur_);
    360 }
    361 
    362 void Tokenizer::Advance() {
    363   DCHECK(cur_ < input_.size());
    364   if (IsCurrentNewline()) {
    365     line_number_++;
    366     char_in_line_ = 1;
    367   } else {
    368     char_in_line_++;
    369   }
    370   cur_++;
    371 }
    372 
    373 Location Tokenizer::GetCurrentLocation() const {
    374   return Location(
    375       input_file_, line_number_, char_in_line_, static_cast<int>(cur_));
    376 }
    377 
    378 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
    379   std::string help;
    380   if (cur_char() == ';') {
    381     // Semicolon.
    382     help = "Semicolons are not needed, delete this one.";
    383   } else if (cur_char() == '\t') {
    384     // Tab.
    385     help = "You got a tab character in here. Tabs are evil. "
    386            "Convert to spaces.";
    387   } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
    388       (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
    389     // Different types of comments.
    390     help = "Comments should start with # instead";
    391   } else {
    392     help = "I have no idea what this is.";
    393   }
    394 
    395   return Err(location, "Invalid token.", help);
    396 }
    397