1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "tools/gn/tokenizer.h" 6 7 #include "base/logging.h" 8 #include "base/strings/string_util.h" 9 #include "tools/gn/input_file.h" 10 11 namespace { 12 13 bool CouldBeTwoCharOperatorBegin(char c) { 14 return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' || 15 c == '+' || c == '|' || c == '&'; 16 } 17 18 bool CouldBeTwoCharOperatorEnd(char c) { 19 return c == '=' || c == '|' || c == '&'; 20 } 21 22 bool CouldBeOneCharOperator(char c) { 23 return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' || 24 c == ':' || c == '|' || c == '&' || c == '-'; 25 } 26 27 bool CouldBeOperator(char c) { 28 return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c); 29 } 30 31 bool IsScoperChar(char c) { 32 return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}'; 33 } 34 35 Token::Type GetSpecificOperatorType(base::StringPiece value) { 36 if (value == "=") 37 return Token::EQUAL; 38 if (value == "+") 39 return Token::PLUS; 40 if (value == "-") 41 return Token::MINUS; 42 if (value == "+=") 43 return Token::PLUS_EQUALS; 44 if (value == "-=") 45 return Token::MINUS_EQUALS; 46 if (value == "==") 47 return Token::EQUAL_EQUAL; 48 if (value == "!=") 49 return Token::NOT_EQUAL; 50 if (value == "<=") 51 return Token::LESS_EQUAL; 52 if (value == ">=") 53 return Token::GREATER_EQUAL; 54 if (value == "<") 55 return Token::LESS_THAN; 56 if (value == ">") 57 return Token::GREATER_THAN; 58 if (value == "&&") 59 return Token::BOOLEAN_AND; 60 if (value == "||") 61 return Token::BOOLEAN_OR; 62 if (value == "!") 63 return Token::BANG; 64 if (value == ".") 65 return Token::DOT; 66 return Token::INVALID; 67 } 68 69 } // namespace 70 71 Tokenizer::Tokenizer(const InputFile* input_file, Err* err) 72 : input_file_(input_file), 73 input_(input_file->contents()), 74 err_(err), 75 cur_(0), 76 line_number_(1), 77 char_in_line_(1) { 78 } 79 80 Tokenizer::~Tokenizer() { 81 } 82 83 // static 84 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) { 85 Tokenizer t(input_file, err); 86 return t.Run(); 87 } 88 89 std::vector<Token> Tokenizer::Run() { 90 DCHECK(tokens_.empty()); 91 while (!done()) { 92 AdvanceToNextToken(); 93 if (done()) 94 break; 95 Location location = GetCurrentLocation(); 96 97 Token::Type type = ClassifyCurrent(); 98 if (type == Token::INVALID) { 99 *err_ = GetErrorForInvalidToken(location); 100 break; 101 } 102 size_t token_begin = cur_; 103 AdvanceToEndOfToken(location, type); 104 if (has_error()) 105 break; 106 size_t token_end = cur_; 107 108 base::StringPiece token_value(&input_.data()[token_begin], 109 token_end - token_begin); 110 111 if (type == Token::UNCLASSIFIED_OPERATOR) { 112 type = GetSpecificOperatorType(token_value); 113 } else if (type == Token::IDENTIFIER) { 114 if (token_value == "if") 115 type = Token::IF; 116 else if (token_value == "else") 117 type = Token::ELSE; 118 else if (token_value == "true") 119 type = Token::TRUE_TOKEN; 120 else if (token_value == "false") 121 type = Token::FALSE_TOKEN; 122 } else if (type == Token::UNCLASSIFIED_COMMENT) { 123 if (AtStartOfLine(token_begin) && 124 // If it's a standalone comment, but is a continuation of a comment on 125 // a previous line, then instead make it a continued suffix comment. 126 (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT || 127 tokens_.back().location().line_number() + 1 != 128 location.line_number() || 129 tokens_.back().location().char_offset() != location.char_offset())) { 130 type = Token::LINE_COMMENT; 131 Advance(); // The current \n. 132 // If this comment is separated from the next syntax element, then we 133 // want to tag it as a block comment. This will become a standalone 134 // statement at the parser level to keep this comment separate, rather 135 // than attached to the subsequent statement. 136 while (!at_end() && IsCurrentWhitespace()) { 137 if (IsCurrentNewline()) { 138 type = Token::BLOCK_COMMENT; 139 break; 140 } 141 Advance(); 142 } 143 } else { 144 type = Token::SUFFIX_COMMENT; 145 } 146 } 147 148 tokens_.push_back(Token(location, type, token_value)); 149 } 150 if (err_->has_error()) 151 tokens_.clear(); 152 return tokens_; 153 } 154 155 // static 156 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) { 157 DCHECK_GT(n, 0); 158 159 if (n == 1) 160 return 0; 161 162 int cur_line = 1; 163 size_t cur_byte = 0; 164 while (cur_byte < buf.size()) { 165 if (IsNewline(buf, cur_byte)) { 166 cur_line++; 167 if (cur_line == n) 168 return cur_byte + 1; 169 } 170 cur_byte++; 171 } 172 return static_cast<size_t>(-1); 173 } 174 175 // static 176 bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) { 177 DCHECK(offset < buffer.size()); 178 // We may need more logic here to handle different line ending styles. 179 return buffer[offset] == '\n'; 180 } 181 182 183 void Tokenizer::AdvanceToNextToken() { 184 while (!at_end() && IsCurrentWhitespace()) 185 Advance(); 186 } 187 188 Token::Type Tokenizer::ClassifyCurrent() const { 189 DCHECK(!at_end()); 190 char next_char = cur_char(); 191 if (IsAsciiDigit(next_char)) 192 return Token::INTEGER; 193 if (next_char == '"') 194 return Token::STRING; 195 196 // Note: '-' handled specially below. 197 if (next_char != '-' && CouldBeOperator(next_char)) 198 return Token::UNCLASSIFIED_OPERATOR; 199 200 if (IsIdentifierFirstChar(next_char)) 201 return Token::IDENTIFIER; 202 203 if (next_char == '[') 204 return Token::LEFT_BRACKET; 205 if (next_char == ']') 206 return Token::RIGHT_BRACKET; 207 if (next_char == '(') 208 return Token::LEFT_PAREN; 209 if (next_char == ')') 210 return Token::RIGHT_PAREN; 211 if (next_char == '{') 212 return Token::LEFT_BRACE; 213 if (next_char == '}') 214 return Token::RIGHT_BRACE; 215 216 if (next_char == '.') 217 return Token::DOT; 218 if (next_char == ',') 219 return Token::COMMA; 220 221 if (next_char == '#') 222 return Token::UNCLASSIFIED_COMMENT; 223 224 // For the case of '-' differentiate between a negative number and anything 225 // else. 226 if (next_char == '-') { 227 if (!CanIncrement()) 228 return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of 229 // file. 230 char following_char = input_[cur_ + 1]; 231 if (IsAsciiDigit(following_char)) 232 return Token::INTEGER; 233 return Token::UNCLASSIFIED_OPERATOR; 234 } 235 236 return Token::INVALID; 237 } 238 239 void Tokenizer::AdvanceToEndOfToken(const Location& location, 240 Token::Type type) { 241 switch (type) { 242 case Token::INTEGER: 243 do { 244 Advance(); 245 } while (!at_end() && IsAsciiDigit(cur_char())); 246 if (!at_end()) { 247 // Require the char after a number to be some kind of space, scope, 248 // or operator. 249 char c = cur_char(); 250 if (!IsCurrentWhitespace() && !CouldBeOperator(c) && 251 !IsScoperChar(c) && c != ',') { 252 *err_ = Err(GetCurrentLocation(), 253 "This is not a valid number.", 254 "Learn to count."); 255 // Highlight the number. 256 err_->AppendRange(LocationRange(location, GetCurrentLocation())); 257 } 258 } 259 break; 260 261 case Token::STRING: { 262 char initial = cur_char(); 263 Advance(); // Advance past initial " 264 for (;;) { 265 if (at_end()) { 266 *err_ = Err(LocationRange(location, GetCurrentLocation()), 267 "Unterminated string literal.", 268 "Don't leave me hanging like this!"); 269 break; 270 } 271 if (IsCurrentStringTerminator(initial)) { 272 Advance(); // Skip past last " 273 break; 274 } else if (cur_char() == '\n') { 275 *err_ = Err(LocationRange(location, GetCurrentLocation()), 276 "Newline in string constant."); 277 } 278 Advance(); 279 } 280 break; 281 } 282 283 case Token::UNCLASSIFIED_OPERATOR: 284 // Some operators are two characters, some are one. 285 if (CouldBeTwoCharOperatorBegin(cur_char())) { 286 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1])) 287 Advance(); 288 } 289 Advance(); 290 break; 291 292 case Token::IDENTIFIER: 293 while (!at_end() && IsIdentifierContinuingChar(cur_char())) 294 Advance(); 295 break; 296 297 case Token::LEFT_BRACKET: 298 case Token::RIGHT_BRACKET: 299 case Token::LEFT_BRACE: 300 case Token::RIGHT_BRACE: 301 case Token::LEFT_PAREN: 302 case Token::RIGHT_PAREN: 303 case Token::DOT: 304 case Token::COMMA: 305 Advance(); // All are one char. 306 break; 307 308 case Token::UNCLASSIFIED_COMMENT: 309 // Eat to EOL. 310 while (!at_end() && !IsCurrentNewline()) 311 Advance(); 312 break; 313 314 case Token::INVALID: 315 default: 316 *err_ = Err(location, "Everything is all messed up", 317 "Please insert system disk in drive A: and press any key."); 318 NOTREACHED(); 319 return; 320 } 321 } 322 323 bool Tokenizer::AtStartOfLine(size_t location) const { 324 while (location > 0) { 325 --location; 326 char c = input_[location]; 327 if (c == '\n') 328 return true; 329 if (c != ' ') 330 return false; 331 } 332 return true; 333 } 334 335 bool Tokenizer::IsCurrentWhitespace() const { 336 DCHECK(!at_end()); 337 char c = input_[cur_]; 338 // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal. 339 return c == 0x0A || c == 0x0D || c == 0x20; 340 } 341 342 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const { 343 DCHECK(!at_end()); 344 if (cur_char() != quote_char) 345 return false; 346 347 // Check for escaping. \" is not a string terminator, but \\" is. Count 348 // the number of preceeding backslashes. 349 int num_backslashes = 0; 350 for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--) 351 num_backslashes++; 352 353 // Even backslashes mean that they were escaping each other and don't count 354 // as escaping this quote. 355 return (num_backslashes % 2) == 0; 356 } 357 358 bool Tokenizer::IsCurrentNewline() const { 359 return IsNewline(input_, cur_); 360 } 361 362 void Tokenizer::Advance() { 363 DCHECK(cur_ < input_.size()); 364 if (IsCurrentNewline()) { 365 line_number_++; 366 char_in_line_ = 1; 367 } else { 368 char_in_line_++; 369 } 370 cur_++; 371 } 372 373 Location Tokenizer::GetCurrentLocation() const { 374 return Location( 375 input_file_, line_number_, char_in_line_, static_cast<int>(cur_)); 376 } 377 378 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const { 379 std::string help; 380 if (cur_char() == ';') { 381 // Semicolon. 382 help = "Semicolons are not needed, delete this one."; 383 } else if (cur_char() == '\t') { 384 // Tab. 385 help = "You got a tab character in here. Tabs are evil. " 386 "Convert to spaces."; 387 } else if (cur_char() == '/' && cur_ + 1 < input_.size() && 388 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) { 389 // Different types of comments. 390 help = "Comments should start with # instead"; 391 } else { 392 help = "I have no idea what this is."; 393 } 394 395 return Err(location, "Invalid token.", help); 396 } 397