Home | History | Annotate | Download | only in gn
      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "tools/gn/tokenizer.h"
      6 
      7 #include "base/logging.h"
      8 #include "tools/gn/input_file.h"
      9 
     10 namespace {
     11 
     12 bool IsNumberChar(char c) {
     13   return c == '-' || (c >= '0' && c <= '9');
     14 }
     15 
     16 bool CouldBeTwoCharOperatorBegin(char c) {
     17   return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
     18          c == '+' || c == '|' || c == '&';
     19 }
     20 
     21 bool CouldBeTwoCharOperatorEnd(char c) {
     22   return c == '=' || c == '|' || c == '&';
     23 }
     24 
     25 bool CouldBeOneCharOperator(char c) {
     26   return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
     27          c == ':' || c == '|' || c == '&' || c == '-';
     28 }
     29 
     30 bool CouldBeOperator(char c) {
     31   return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
     32 }
     33 
     34 bool IsSeparatorChar(char c) {
     35   return c == ',';
     36 }
     37 
     38 bool IsScoperChar(char c) {
     39   return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
     40 }
     41 
     42 }  // namespace
     43 
     44 Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
     45     : input_file_(input_file),
     46       input_(input_file->contents()),
     47       err_(err),
     48       cur_(0),
     49       line_number_(1),
     50       char_in_line_(1) {
     51 }
     52 
     53 Tokenizer::~Tokenizer() {
     54 }
     55 
     56 // static
     57 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
     58   Tokenizer t(input_file, err);
     59   return t.Run();
     60 }
     61 
     62 std::vector<Token> Tokenizer::Run() {
     63   std::vector<Token> tokens;
     64   while (!done()) {
     65     AdvanceToNextToken();
     66     if (done())
     67       break;
     68     Location location = GetCurrentLocation();
     69 
     70     Token::Type type = ClassifyCurrent();
     71     if (type == Token::INVALID) {
     72       *err_ = GetErrorForInvalidToken(location);
     73       break;
     74     }
     75     size_t token_begin = cur_;
     76     AdvanceToEndOfToken(location, type);
     77     if (has_error())
     78       break;
     79     size_t token_end = cur_;
     80 
     81     // TODO(brettw) This just strips comments from the token stream. This
     82     // is probably wrong, they should be removed at a later stage so we can
     83     // do things like rewrite the file. But this makes the parser simpler and
     84     // is OK for now.
     85     if (type != Token::COMMENT) {
     86       tokens.push_back(Token(
     87           location,
     88           type,
     89           base::StringPiece(&input_.data()[token_begin],
     90                             token_end - token_begin)));
     91     }
     92   }
     93   if (err_->has_error())
     94     tokens.clear();
     95   return tokens;
     96 }
     97 
     98 // static
     99 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
    100   int cur_line = 1;
    101   size_t cur_byte = 0;
    102 
    103   DCHECK(n > 0);
    104 
    105   if (n == 1)
    106     return 0;
    107 
    108   while (cur_byte < buf.size()) {
    109     if (IsNewline(buf, cur_byte)) {
    110       cur_line++;
    111       if (cur_line == n)
    112         return cur_byte + 1;
    113     }
    114     cur_byte++;
    115   }
    116   return -1;
    117 }
    118 
    119 // static
    120 bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
    121   DCHECK(offset < buffer.size());
    122   // We may need more logic here to handle different line ending styles.
    123   return buffer[offset] == '\n';
    124 }
    125 
    126 
    127 void Tokenizer::AdvanceToNextToken() {
    128   while (!at_end() && IsCurrentWhitespace())
    129     Advance();
    130 }
    131 
    132 Token::Type Tokenizer::ClassifyCurrent() const {
    133   DCHECK(!at_end());
    134   char next_char = cur_char();
    135   if (next_char >= '0' && next_char <= '9')
    136     return Token::INTEGER;
    137   if (next_char == '"')
    138     return Token::STRING;
    139 
    140   // Note: '-' handled specially below.
    141   if (next_char != '-' && CouldBeOperator(next_char))
    142     return Token::OPERATOR;
    143 
    144   if (IsIdentifierFirstChar(next_char))
    145     return Token::IDENTIFIER;
    146 
    147   if (IsScoperChar(next_char))
    148     return Token::SCOPER;
    149 
    150   if (IsSeparatorChar(next_char))
    151     return Token::SEPARATOR;
    152 
    153   if (next_char == '#')
    154     return Token::COMMENT;
    155 
    156   // For the case of '-' differentiate between a negative number and anything
    157   // else.
    158   if (next_char == '-') {
    159     if (!CanIncrement())
    160       return Token::OPERATOR;  // Just the minus before end of file.
    161     char following_char = input_[cur_ + 1];
    162     if (following_char >= '0' && following_char <= '9')
    163       return Token::INTEGER;
    164     return Token::OPERATOR;
    165   }
    166 
    167   return Token::INVALID;
    168 }
    169 
    170 void Tokenizer::AdvanceToEndOfToken(const Location& location,
    171                                     Token::Type type) {
    172   switch (type) {
    173     case Token::INTEGER:
    174       do {
    175         Advance();
    176       } while (!at_end() && IsNumberChar(cur_char()));
    177       if (!at_end()) {
    178         // Require the char after a number to be some kind of space, scope,
    179         // or operator.
    180         char c = cur_char();
    181         if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
    182             !IsScoperChar(c) && !IsSeparatorChar(c)) {
    183           *err_ = Err(GetCurrentLocation(),
    184               "This is not a valid number.",
    185               "Learn to count.");
    186           // Highlight the number.
    187           err_->AppendRange(LocationRange(location, GetCurrentLocation()));
    188         }
    189       }
    190       break;
    191 
    192     case Token::STRING: {
    193       char initial = cur_char();
    194       Advance();  // Advance past initial "
    195       for (;;) {
    196         if (at_end()) {
    197           *err_ = Err(LocationRange(location,
    198                           Location(input_file_, line_number_, char_in_line_)),
    199                      "Unterminated string literal.",
    200                      "Don't leave me hanging like this!");
    201           break;
    202         }
    203         if (IsCurrentStringTerminator(initial)) {
    204           Advance();  // Skip past last "
    205           break;
    206         } else if (cur_char() == '\n') {
    207           *err_ = Err(LocationRange(location,
    208                                    GetCurrentLocation()),
    209                      "Newline in string constant.");
    210         }
    211         Advance();
    212       }
    213       break;
    214     }
    215 
    216     case Token::OPERATOR:
    217       // Some operators are two characters, some are one.
    218       if (CouldBeTwoCharOperatorBegin(cur_char())) {
    219         if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
    220           Advance();
    221       }
    222       Advance();
    223       break;
    224 
    225     case Token::IDENTIFIER:
    226       while (!at_end() && IsIdentifierContinuingChar(cur_char()))
    227         Advance();
    228       break;
    229 
    230     case Token::SCOPER:
    231     case Token::SEPARATOR:
    232       Advance();  // All are one char.
    233       break;
    234 
    235     case Token::COMMENT:
    236       // Eat to EOL.
    237       while (!at_end() && !IsCurrentNewline())
    238         Advance();
    239       break;
    240 
    241     case Token::INVALID:
    242       *err_ = Err(location, "Everything is all messed up",
    243                   "Please insert system disk in drive A: and press any key.");
    244       NOTREACHED();
    245       return;
    246   }
    247 }
    248 
    249 bool Tokenizer::IsCurrentWhitespace() const {
    250   DCHECK(!at_end());
    251   char c = input_[cur_];
    252   // Note that tab (0x09) is illegal.
    253   return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20;
    254 }
    255 
    256 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
    257   DCHECK(!at_end());
    258   if (cur_char() != quote_char)
    259     return false;
    260 
    261   // Check for escaping. \" is not a string terminator, but \\" is. Count
    262   // the number of preceeding backslashes.
    263   int num_backslashes = 0;
    264   for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
    265     num_backslashes++;
    266 
    267   // Even backslashes mean that they were escaping each other and don't count
    268   // as escaping this quote.
    269   return (num_backslashes % 2) == 0;
    270 }
    271 
    272 bool Tokenizer::IsCurrentNewline() const {
    273   return IsNewline(input_, cur_);
    274 }
    275 
    276 void Tokenizer::Advance() {
    277   DCHECK(cur_ < input_.size());
    278   if (IsCurrentNewline()) {
    279     line_number_++;
    280     char_in_line_ = 1;
    281   } else {
    282     char_in_line_++;
    283   }
    284   cur_++;
    285 }
    286 
    287 Location Tokenizer::GetCurrentLocation() const {
    288   return Location(input_file_, line_number_, char_in_line_);
    289 }
    290 
    291 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
    292   std::string help;
    293   if (cur_char() == ';') {
    294     // Semicolon.
    295     help = "Semicolons are not needed, delete this one.";
    296   } else if (cur_char() == '\t') {
    297     // Tab.
    298     help = "You got a tab character in here. Tabs are evil. "
    299            "Convert to spaces.";
    300   } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
    301       (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
    302     // Different types of comments.
    303     help = "Comments should start with # instead";
    304   } else {
    305     help = "I have no idea what this is.";
    306   }
    307 
    308   return Err(location, "Invalid token.", help);
    309 }
    310