Home | History | Annotate | Download | only in gn
      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "tools/gn/tokenizer.h"
      6 
      7 #include "base/logging.h"
      8 #include "tools/gn/input_file.h"
      9 
     10 namespace {
     11 
     12 bool IsNumberChar(char c) {
     13   return c >= '0' && c <= '9';
     14 }
     15 
     16 bool CouldBeTwoCharOperatorBegin(char c) {
     17   return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
     18          c == '+' || c == '|' || c == '&';
     19 }
     20 
     21 bool CouldBeTwoCharOperatorEnd(char c) {
     22   return c == '=' || c == '|' || c == '&';
     23 }
     24 
     25 bool CouldBeOneCharOperator(char c) {
     26   return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
     27          c == ':' || c == '|' || c == '&' || c == '-';
     28 }
     29 
     30 bool CouldBeOperator(char c) {
     31   return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
     32 }
     33 
     34 bool IsScoperChar(char c) {
     35   return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
     36 }
     37 
     38 Token::Type GetSpecificOperatorType(base::StringPiece value) {
     39   if (value == "=")
     40     return Token::EQUAL;
     41   if (value == "+")
     42     return Token::PLUS;
     43   if (value == "-")
     44     return Token::MINUS;
     45   if (value == "+=")
     46     return Token::PLUS_EQUALS;
     47   if (value == "-=")
     48     return Token::MINUS_EQUALS;
     49   if (value == "==")
     50     return Token::EQUAL_EQUAL;
     51   if (value == "!=")
     52     return Token::NOT_EQUAL;
     53   if (value == "<=")
     54     return Token::LESS_EQUAL;
     55   if (value == ">=")
     56     return Token::GREATER_EQUAL;
     57   if (value == "<")
     58     return Token::LESS_THAN;
     59   if (value == ">")
     60     return Token::GREATER_THAN;
     61   if (value == "&&")
     62     return Token::BOOLEAN_AND;
     63   if (value == "||")
     64     return Token::BOOLEAN_OR;
     65   if (value == "!")
     66     return Token::BANG;
     67   return Token::INVALID;
     68 }
     69 
     70 }  // namespace
     71 
     72 Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
     73     : input_file_(input_file),
     74       input_(input_file->contents()),
     75       err_(err),
     76       cur_(0),
     77       line_number_(1),
     78       char_in_line_(1) {
     79 }
     80 
     81 Tokenizer::~Tokenizer() {
     82 }
     83 
     84 // static
     85 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
     86   Tokenizer t(input_file, err);
     87   return t.Run();
     88 }
     89 
     90 std::vector<Token> Tokenizer::Run() {
     91   DCHECK(tokens_.empty());
     92   while (!done()) {
     93     AdvanceToNextToken();
     94     if (done())
     95       break;
     96     Location location = GetCurrentLocation();
     97 
     98     Token::Type type = ClassifyCurrent();
     99     if (type == Token::INVALID) {
    100       *err_ = GetErrorForInvalidToken(location);
    101       break;
    102     }
    103     size_t token_begin = cur_;
    104     AdvanceToEndOfToken(location, type);
    105     if (has_error())
    106       break;
    107     size_t token_end = cur_;
    108 
    109     base::StringPiece token_value(&input_.data()[token_begin],
    110                                   token_end - token_begin);
    111 
    112     if (type == Token::UNCLASSIFIED_OPERATOR)
    113       type = GetSpecificOperatorType(token_value);
    114     if (type == Token::IDENTIFIER) {
    115       if (token_value == "if")
    116         type = Token::IF;
    117       else if (token_value == "else")
    118         type = Token::ELSE;
    119       else if (token_value == "true")
    120         type = Token::TRUE_TOKEN;
    121       else if (token_value == "false")
    122         type = Token::FALSE_TOKEN;
    123     }
    124 
    125     // TODO(brettw) This just strips comments from the token stream. This
    126     // is probably wrong, they should be removed at a later stage so we can
    127     // do things like rewrite the file. But this makes the parser simpler and
    128     // is OK for now.
    129     if (type != Token::COMMENT)
    130       tokens_.push_back(Token(location, type, token_value));
    131   }
    132   if (err_->has_error())
    133     tokens_.clear();
    134   return tokens_;
    135 }
    136 
    137 // static
    138 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
    139   int cur_line = 1;
    140   size_t cur_byte = 0;
    141 
    142   DCHECK(n > 0);
    143 
    144   if (n == 1)
    145     return 0;
    146 
    147   while (cur_byte < buf.size()) {
    148     if (IsNewline(buf, cur_byte)) {
    149       cur_line++;
    150       if (cur_line == n)
    151         return cur_byte + 1;
    152     }
    153     cur_byte++;
    154   }
    155   return -1;
    156 }
    157 
    158 // static
    159 bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
    160   DCHECK(offset < buffer.size());
    161   // We may need more logic here to handle different line ending styles.
    162   return buffer[offset] == '\n';
    163 }
    164 
    165 
    166 void Tokenizer::AdvanceToNextToken() {
    167   while (!at_end() && IsCurrentWhitespace())
    168     Advance();
    169 }
    170 
    171 Token::Type Tokenizer::ClassifyCurrent() const {
    172   DCHECK(!at_end());
    173   char next_char = cur_char();
    174   if (next_char >= '0' && next_char <= '9')
    175     return Token::INTEGER;
    176   if (next_char == '"')
    177     return Token::STRING;
    178 
    179   // Note: '-' handled specially below.
    180   if (next_char != '-' && CouldBeOperator(next_char))
    181     return Token::UNCLASSIFIED_OPERATOR;
    182 
    183   if (IsIdentifierFirstChar(next_char))
    184     return Token::IDENTIFIER;
    185 
    186   if (next_char == '[')
    187     return Token::LEFT_BRACKET;
    188   if (next_char == ']')
    189     return Token::RIGHT_BRACKET;
    190   if (next_char == '(')
    191     return Token::LEFT_PAREN;
    192   if (next_char == ')')
    193     return Token::RIGHT_PAREN;
    194   if (next_char == '{')
    195     return Token::LEFT_BRACE;
    196   if (next_char == '}')
    197     return Token::RIGHT_BRACE;
    198 
    199   if (next_char == ',')
    200     return Token::COMMA;
    201 
    202   if (next_char == '#')
    203     return Token::COMMENT;
    204 
    205   // For the case of '-' differentiate between a negative number and anything
    206   // else.
    207   if (next_char == '-') {
    208     if (!CanIncrement())
    209       return Token::UNCLASSIFIED_OPERATOR;  // Just the minus before end of
    210                                             // file.
    211     char following_char = input_[cur_ + 1];
    212     if (following_char >= '0' && following_char <= '9')
    213       return Token::INTEGER;
    214     return Token::UNCLASSIFIED_OPERATOR;
    215   }
    216 
    217   return Token::INVALID;
    218 }
    219 
    220 void Tokenizer::AdvanceToEndOfToken(const Location& location,
    221                                     Token::Type type) {
    222   switch (type) {
    223     case Token::INTEGER:
    224       do {
    225         Advance();
    226       } while (!at_end() && IsNumberChar(cur_char()));
    227       if (!at_end()) {
    228         // Require the char after a number to be some kind of space, scope,
    229         // or operator.
    230         char c = cur_char();
    231         if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
    232             !IsScoperChar(c) && c != ',') {
    233           *err_ = Err(GetCurrentLocation(),
    234               "This is not a valid number.",
    235               "Learn to count.");
    236           // Highlight the number.
    237           err_->AppendRange(LocationRange(location, GetCurrentLocation()));
    238         }
    239       }
    240       break;
    241 
    242     case Token::STRING: {
    243       char initial = cur_char();
    244       Advance();  // Advance past initial "
    245       for (;;) {
    246         if (at_end()) {
    247           *err_ = Err(LocationRange(location,
    248                           Location(input_file_, line_number_, char_in_line_)),
    249                      "Unterminated string literal.",
    250                      "Don't leave me hanging like this!");
    251           break;
    252         }
    253         if (IsCurrentStringTerminator(initial)) {
    254           Advance();  // Skip past last "
    255           break;
    256         } else if (cur_char() == '\n') {
    257           *err_ = Err(LocationRange(location,
    258                                    GetCurrentLocation()),
    259                      "Newline in string constant.");
    260         }
    261         Advance();
    262       }
    263       break;
    264     }
    265 
    266     case Token::UNCLASSIFIED_OPERATOR:
    267       // Some operators are two characters, some are one.
    268       if (CouldBeTwoCharOperatorBegin(cur_char())) {
    269         if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
    270           Advance();
    271       }
    272       Advance();
    273       break;
    274 
    275     case Token::IDENTIFIER:
    276       while (!at_end() && IsIdentifierContinuingChar(cur_char()))
    277         Advance();
    278       break;
    279 
    280     case Token::LEFT_BRACKET:
    281     case Token::RIGHT_BRACKET:
    282     case Token::LEFT_BRACE:
    283     case Token::RIGHT_BRACE:
    284     case Token::LEFT_PAREN:
    285     case Token::RIGHT_PAREN:
    286     case Token::COMMA:
    287       Advance();  // All are one char.
    288       break;
    289 
    290     case Token::COMMENT:
    291       // Eat to EOL.
    292       while (!at_end() && !IsCurrentNewline())
    293         Advance();
    294       break;
    295 
    296     case Token::INVALID:
    297     default:
    298       *err_ = Err(location, "Everything is all messed up",
    299                   "Please insert system disk in drive A: and press any key.");
    300       NOTREACHED();
    301       return;
    302   }
    303 }
    304 
    305 bool Tokenizer::IsCurrentWhitespace() const {
    306   DCHECK(!at_end());
    307   char c = input_[cur_];
    308   // Note that tab (0x09) is illegal.
    309   return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20;
    310 }
    311 
    312 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
    313   DCHECK(!at_end());
    314   if (cur_char() != quote_char)
    315     return false;
    316 
    317   // Check for escaping. \" is not a string terminator, but \\" is. Count
    318   // the number of preceeding backslashes.
    319   int num_backslashes = 0;
    320   for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
    321     num_backslashes++;
    322 
    323   // Even backslashes mean that they were escaping each other and don't count
    324   // as escaping this quote.
    325   return (num_backslashes % 2) == 0;
    326 }
    327 
    328 bool Tokenizer::IsCurrentNewline() const {
    329   return IsNewline(input_, cur_);
    330 }
    331 
    332 void Tokenizer::Advance() {
    333   DCHECK(cur_ < input_.size());
    334   if (IsCurrentNewline()) {
    335     line_number_++;
    336     char_in_line_ = 1;
    337   } else {
    338     char_in_line_++;
    339   }
    340   cur_++;
    341 }
    342 
    343 Location Tokenizer::GetCurrentLocation() const {
    344   return Location(input_file_, line_number_, char_in_line_);
    345 }
    346 
    347 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
    348   std::string help;
    349   if (cur_char() == ';') {
    350     // Semicolon.
    351     help = "Semicolons are not needed, delete this one.";
    352   } else if (cur_char() == '\t') {
    353     // Tab.
    354     help = "You got a tab character in here. Tabs are evil. "
    355            "Convert to spaces.";
    356   } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
    357       (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
    358     // Different types of comments.
    359     help = "Comments should start with # instead";
    360   } else {
    361     help = "I have no idea what this is.";
    362   }
    363 
    364   return Err(location, "Invalid token.", help);
    365 }
    366