1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "tools/gn/tokenizer.h" 6 7 #include "base/logging.h" 8 #include "tools/gn/input_file.h" 9 10 namespace { 11 12 bool CouldBeTwoCharOperatorBegin(char c) { 13 return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' || 14 c == '+' || c == '|' || c == '&'; 15 } 16 17 bool CouldBeTwoCharOperatorEnd(char c) { 18 return c == '=' || c == '|' || c == '&'; 19 } 20 21 bool CouldBeOneCharOperator(char c) { 22 return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' || 23 c == ':' || c == '|' || c == '&' || c == '-'; 24 } 25 26 bool CouldBeOperator(char c) { 27 return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c); 28 } 29 30 bool IsScoperChar(char c) { 31 return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}'; 32 } 33 34 Token::Type GetSpecificOperatorType(base::StringPiece value) { 35 if (value == "=") 36 return Token::EQUAL; 37 if (value == "+") 38 return Token::PLUS; 39 if (value == "-") 40 return Token::MINUS; 41 if (value == "+=") 42 return Token::PLUS_EQUALS; 43 if (value == "-=") 44 return Token::MINUS_EQUALS; 45 if (value == "==") 46 return Token::EQUAL_EQUAL; 47 if (value == "!=") 48 return Token::NOT_EQUAL; 49 if (value == "<=") 50 return Token::LESS_EQUAL; 51 if (value == ">=") 52 return Token::GREATER_EQUAL; 53 if (value == "<") 54 return Token::LESS_THAN; 55 if (value == ">") 56 return Token::GREATER_THAN; 57 if (value == "&&") 58 return Token::BOOLEAN_AND; 59 if (value == "||") 60 return Token::BOOLEAN_OR; 61 if (value == "!") 62 return Token::BANG; 63 if (value == ".") 64 return Token::DOT; 65 return Token::INVALID; 66 } 67 68 } // namespace 69 70 Tokenizer::Tokenizer(const InputFile* input_file, Err* err) 71 : input_file_(input_file), 72 input_(input_file->contents()), 73 err_(err), 74 cur_(0), 75 line_number_(1), 76 char_in_line_(1) { 77 } 78 79 Tokenizer::~Tokenizer() { 80 } 81 82 // static 83 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) { 84 Tokenizer t(input_file, err); 85 return t.Run(); 86 } 87 88 std::vector<Token> Tokenizer::Run() { 89 DCHECK(tokens_.empty()); 90 while (!done()) { 91 AdvanceToNextToken(); 92 if (done()) 93 break; 94 Location location = GetCurrentLocation(); 95 96 Token::Type type = ClassifyCurrent(); 97 if (type == Token::INVALID) { 98 *err_ = GetErrorForInvalidToken(location); 99 break; 100 } 101 size_t token_begin = cur_; 102 AdvanceToEndOfToken(location, type); 103 if (has_error()) 104 break; 105 size_t token_end = cur_; 106 107 base::StringPiece token_value(&input_.data()[token_begin], 108 token_end - token_begin); 109 110 if (type == Token::UNCLASSIFIED_OPERATOR) 111 type = GetSpecificOperatorType(token_value); 112 if (type == Token::IDENTIFIER) { 113 if (token_value == "if") 114 type = Token::IF; 115 else if (token_value == "else") 116 type = Token::ELSE; 117 else if (token_value == "true") 118 type = Token::TRUE_TOKEN; 119 else if (token_value == "false") 120 type = Token::FALSE_TOKEN; 121 } 122 123 // TODO(brettw) This just strips comments from the token stream. This 124 // is probably wrong, they should be removed at a later stage so we can 125 // do things like rewrite the file. But this makes the parser simpler and 126 // is OK for now. 127 if (type != Token::COMMENT) 128 tokens_.push_back(Token(location, type, token_value)); 129 } 130 if (err_->has_error()) 131 tokens_.clear(); 132 return tokens_; 133 } 134 135 // static 136 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) { 137 int cur_line = 1; 138 size_t cur_byte = 0; 139 140 DCHECK(n > 0); 141 142 if (n == 1) 143 return 0; 144 145 while (cur_byte < buf.size()) { 146 if (IsNewline(buf, cur_byte)) { 147 cur_line++; 148 if (cur_line == n) 149 return cur_byte + 1; 150 } 151 cur_byte++; 152 } 153 return -1; 154 } 155 156 // static 157 bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) { 158 DCHECK(offset < buffer.size()); 159 // We may need more logic here to handle different line ending styles. 160 return buffer[offset] == '\n'; 161 } 162 163 164 void Tokenizer::AdvanceToNextToken() { 165 while (!at_end() && IsCurrentWhitespace()) 166 Advance(); 167 } 168 169 Token::Type Tokenizer::ClassifyCurrent() const { 170 DCHECK(!at_end()); 171 char next_char = cur_char(); 172 if (IsAsciiDigit(next_char)) 173 return Token::INTEGER; 174 if (next_char == '"') 175 return Token::STRING; 176 177 // Note: '-' handled specially below. 178 if (next_char != '-' && CouldBeOperator(next_char)) 179 return Token::UNCLASSIFIED_OPERATOR; 180 181 if (IsIdentifierFirstChar(next_char)) 182 return Token::IDENTIFIER; 183 184 if (next_char == '[') 185 return Token::LEFT_BRACKET; 186 if (next_char == ']') 187 return Token::RIGHT_BRACKET; 188 if (next_char == '(') 189 return Token::LEFT_PAREN; 190 if (next_char == ')') 191 return Token::RIGHT_PAREN; 192 if (next_char == '{') 193 return Token::LEFT_BRACE; 194 if (next_char == '}') 195 return Token::RIGHT_BRACE; 196 197 if (next_char == '.') 198 return Token::DOT; 199 if (next_char == ',') 200 return Token::COMMA; 201 202 if (next_char == '#') 203 return Token::COMMENT; 204 205 // For the case of '-' differentiate between a negative number and anything 206 // else. 207 if (next_char == '-') { 208 if (!CanIncrement()) 209 return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of 210 // file. 211 char following_char = input_[cur_ + 1]; 212 if (IsAsciiDigit(following_char)) 213 return Token::INTEGER; 214 return Token::UNCLASSIFIED_OPERATOR; 215 } 216 217 return Token::INVALID; 218 } 219 220 void Tokenizer::AdvanceToEndOfToken(const Location& location, 221 Token::Type type) { 222 switch (type) { 223 case Token::INTEGER: 224 do { 225 Advance(); 226 } while (!at_end() && IsAsciiDigit(cur_char())); 227 if (!at_end()) { 228 // Require the char after a number to be some kind of space, scope, 229 // or operator. 230 char c = cur_char(); 231 if (!IsCurrentWhitespace() && !CouldBeOperator(c) && 232 !IsScoperChar(c) && c != ',') { 233 *err_ = Err(GetCurrentLocation(), 234 "This is not a valid number.", 235 "Learn to count."); 236 // Highlight the number. 237 err_->AppendRange(LocationRange(location, GetCurrentLocation())); 238 } 239 } 240 break; 241 242 case Token::STRING: { 243 char initial = cur_char(); 244 Advance(); // Advance past initial " 245 for (;;) { 246 if (at_end()) { 247 *err_ = Err(LocationRange(location, GetCurrentLocation()), 248 "Unterminated string literal.", 249 "Don't leave me hanging like this!"); 250 break; 251 } 252 if (IsCurrentStringTerminator(initial)) { 253 Advance(); // Skip past last " 254 break; 255 } else if (cur_char() == '\n') { 256 *err_ = Err(LocationRange(location, GetCurrentLocation()), 257 "Newline in string constant."); 258 } 259 Advance(); 260 } 261 break; 262 } 263 264 case Token::UNCLASSIFIED_OPERATOR: 265 // Some operators are two characters, some are one. 266 if (CouldBeTwoCharOperatorBegin(cur_char())) { 267 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1])) 268 Advance(); 269 } 270 Advance(); 271 break; 272 273 case Token::IDENTIFIER: 274 while (!at_end() && IsIdentifierContinuingChar(cur_char())) 275 Advance(); 276 break; 277 278 case Token::LEFT_BRACKET: 279 case Token::RIGHT_BRACKET: 280 case Token::LEFT_BRACE: 281 case Token::RIGHT_BRACE: 282 case Token::LEFT_PAREN: 283 case Token::RIGHT_PAREN: 284 case Token::DOT: 285 case Token::COMMA: 286 Advance(); // All are one char. 287 break; 288 289 case Token::COMMENT: 290 // Eat to EOL. 291 while (!at_end() && !IsCurrentNewline()) 292 Advance(); 293 break; 294 295 case Token::INVALID: 296 default: 297 *err_ = Err(location, "Everything is all messed up", 298 "Please insert system disk in drive A: and press any key."); 299 NOTREACHED(); 300 return; 301 } 302 } 303 304 bool Tokenizer::IsCurrentWhitespace() const { 305 DCHECK(!at_end()); 306 char c = input_[cur_]; 307 // Note that tab (0x09) is illegal. 308 return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20; 309 } 310 311 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const { 312 DCHECK(!at_end()); 313 if (cur_char() != quote_char) 314 return false; 315 316 // Check for escaping. \" is not a string terminator, but \\" is. Count 317 // the number of preceeding backslashes. 318 int num_backslashes = 0; 319 for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--) 320 num_backslashes++; 321 322 // Even backslashes mean that they were escaping each other and don't count 323 // as escaping this quote. 324 return (num_backslashes % 2) == 0; 325 } 326 327 bool Tokenizer::IsCurrentNewline() const { 328 return IsNewline(input_, cur_); 329 } 330 331 void Tokenizer::Advance() { 332 DCHECK(cur_ < input_.size()); 333 if (IsCurrentNewline()) { 334 line_number_++; 335 char_in_line_ = 1; 336 } else { 337 char_in_line_++; 338 } 339 cur_++; 340 } 341 342 Location Tokenizer::GetCurrentLocation() const { 343 return Location(input_file_, line_number_, char_in_line_); 344 } 345 346 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const { 347 std::string help; 348 if (cur_char() == ';') { 349 // Semicolon. 350 help = "Semicolons are not needed, delete this one."; 351 } else if (cur_char() == '\t') { 352 // Tab. 353 help = "You got a tab character in here. Tabs are evil. " 354 "Convert to spaces."; 355 } else if (cur_char() == '/' && cur_ + 1 < input_.size() && 356 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) { 357 // Different types of comments. 358 help = "Comments should start with # instead"; 359 } else { 360 help = "I have no idea what this is."; 361 } 362 363 return Err(location, "Invalid token.", help); 364 } 365