# Copyright (C) 2003-2007, 2009, 2010 Nominum, Inc.
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for any purpose with or without fee is hereby granted,
# provided that the above copyright notice and this permission notice
# appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

"""Tokenize DNS master file format"""

import cStringIO
import sys

import dns.exception
import dns.name
import dns.ttl

# Characters that terminate an identifier outside of a quoted string.
_DELIMITERS = {
    ' ' : True,
    '\t' : True,
    '\n' : True,
    ';' : True,
    '(' : True,
    ')' : True,
    '"' : True }

# While reading a quoted string only the closing quote is a delimiter.
_QUOTING_DELIMITERS = { '"' : True }

# Token types.
EOF = 0
EOL = 1
WHITESPACE = 2
IDENTIFIER = 3
QUOTED_STRING = 4
COMMENT = 5
DELIMITER = 6

class UngetBufferFull(dns.exception.DNSException):
    """Raised when an attempt is made to unget a token when the unget
    buffer is full."""
    pass

class Token(object):
    """A DNS master file format token.

    @ivar ttype: The token type
    @type ttype: int
    @ivar value: The token value
    @type value: string
    @ivar has_escape: Does the token value contain escapes?
    @type has_escape: bool
    """

    def __init__(self, ttype, value='', has_escape=False):
        """Initialize a token instance.

        @param ttype: The token type
        @type ttype: int
        @param value: The token value
        @type value: string
        @param has_escape: Does the token value contain escapes?
        @type has_escape: bool
        """
        self.ttype = ttype
        self.value = value
        self.has_escape = has_escape

    def is_eof(self):
        """Is this an EOF token?"""
        return self.ttype == EOF

    def is_eol(self):
        """Is this an EOL token?"""
        return self.ttype == EOL

    def is_whitespace(self):
        """Is this a WHITESPACE token?"""
        return self.ttype == WHITESPACE

    def is_identifier(self):
        """Is this an IDENTIFIER token?"""
        return self.ttype == IDENTIFIER

    def is_quoted_string(self):
        """Is this a QUOTED_STRING token?"""
        return self.ttype == QUOTED_STRING

    def is_comment(self):
        """Is this a COMMENT token?"""
        return self.ttype == COMMENT

    def is_delimiter(self):
        """Is this a DELIMITER token?"""
        return self.ttype == DELIMITER

    def is_eol_or_eof(self):
        """Is this either an EOL or an EOF token?"""
        return (self.ttype == EOL or self.ttype == EOF)

    def __eq__(self, other):
        # Equality considers only the type and value; has_escape is ignored.
        if not isinstance(other, Token):
            return False
        return (self.ttype == other.ttype and
                self.value == other.value)

    def __ne__(self, other):
        if not isinstance(other, Token):
            return True
        return (self.ttype != other.ttype or
                self.value != other.value)

    def __str__(self):
        return '%d "%s"' % (self.ttype, self.value)

    def unescape(self):
        """Return a token whose value has had its backslash escapes
        resolved.

        A backslash followed by three decimal digits is replaced by the
        character with that decimal value; a backslash followed by any
        other character is replaced by that character.  If the token has
        no escapes, the token itself is returned.

        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: the value ended in the middle
        of an escape
        @raises dns.exception.SyntaxError: a decimal escape was not three
        digits long, or its value was greater than 255
        """
        if not self.has_escape:
            return self
        unescaped = ''
        l = len(self.value)
        i = 0
        while i < l:
            c = self.value[i]
            i += 1
            if c == '\\':
                if i >= l:
                    raise dns.exception.UnexpectedEnd
                c = self.value[i]
                i += 1
                if c.isdigit():
                    if i >= l:
                        raise dns.exception.UnexpectedEnd
                    c2 = self.value[i]
                    i += 1
                    if i >= l:
                        raise dns.exception.UnexpectedEnd
                    c3 = self.value[i]
                    i += 1
                    if not (c2.isdigit() and c3.isdigit()):
                        raise dns.exception.SyntaxError
                    codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                    # chr() rejects values above 255 with ValueError;
                    # report it as malformed input instead.
                    if codepoint > 255:
                        raise dns.exception.SyntaxError('escaped decimal value too large')
                    c = chr(codepoint)
            unescaped += c
        return Token(self.ttype, unescaped)

    # compatibility for old-style tuple tokens

    def __len__(self):
        return 2

    def __iter__(self):
        return iter((self.ttype, self.value))

    def __getitem__(self, i):
        if i == 0:
            return self.ttype
        elif i == 1:
            return self.value
        else:
            raise IndexError

class Tokenizer(object):
    """A DNS master file format tokenizer.

    A token is a L{Token} object carrying a I{type}, which is an int, and
    a I{value}, which is a string.  For compatibility with older code a
    token may also be treated as a (type, value) tuple.  The valid types
    are EOF, EOL, WHITESPACE, IDENTIFIER, QUOTED_STRING, COMMENT, and
    DELIMITER.

    @ivar file: The file to tokenize
    @type file: file
    @ivar ungotten_char: The most recently ungotten character, or None.
    @type ungotten_char: string
    @ivar ungotten_token: The most recently ungotten token, or None.
    @type ungotten_token: Token object
    @ivar multiline: The current multiline level.  This value is increased
    by one every time a '(' delimiter is read, and decreased by one every time
    a ')' delimiter is read.
    @type multiline: int
    @ivar quoting: This variable is true if the tokenizer is currently
    reading a quoted string.
    @type quoting: bool
    @ivar eof: This variable is true if the tokenizer has encountered EOF.
    @type eof: bool
    @ivar delimiters: The current delimiter dictionary.
    @type delimiters: dict
    @ivar line_number: The current line number
    @type line_number: int
    @ivar filename: A filename that will be returned by the L{where} method.
    @type filename: string
    """

    def __init__(self, f=sys.stdin, filename=None):
        """Initialize a tokenizer instance.

        @param f: The file to tokenize.  The default is sys.stdin.
        This parameter may also be a string, in which case the tokenizer
        will take its input from the contents of the string.
        @type f: file or string
        @param filename: the name of the filename that the L{where} method
        will return.
        @type filename: string
        """

        if isinstance(f, str):
            f = cStringIO.StringIO(f)
            if filename is None:
                filename = '<string>'
        else:
            if filename is None:
                if f is sys.stdin:
                    filename = '<stdin>'
                else:
                    filename = '<file>'
        self.file = f
        self.ungotten_char = None
        self.ungotten_token = None
        self.multiline = 0
        self.quoting = False
        self.eof = False
        self.delimiters = _DELIMITERS
        self.line_number = 1
        self.filename = filename

    def _get_char(self):
        """Read a character from input.

        An ungotten character, if any, is returned first.  The empty
        string is returned once the end of input has been reached.

        @rtype: string
        """

        if self.ungotten_char is None:
            if self.eof:
                c = ''
            else:
                c = self.file.read(1)
                if c == '':
                    self.eof = True
                elif c == '\n':
                    self.line_number += 1
        else:
            c = self.ungotten_char
            self.ungotten_char = None
        return c

    def where(self):
        """Return the current location in the input.

        @rtype: (string, int) tuple.  The first item is the filename of
        the input, the second is the current line number.
        """

        return (self.filename, self.line_number)

    def _unget_char(self, c):
        """Unget a character.

        The unget buffer for characters is only one character large; it is
        an error to try to unget a character when the unget buffer is not
        empty.

        @param c: the character to unget
        @type c: string
        @raises UngetBufferFull: there is already an ungotten char
        """

        if self.ungotten_char is not None:
            raise UngetBufferFull
        self.ungotten_char = c

    def skip_whitespace(self):
        """Consume input until a non-whitespace character is encountered.

        The non-whitespace character is then ungotten, and the number of
        whitespace characters consumed is returned.

        If the tokenizer is in multiline mode, then newlines are whitespace.

        @rtype: int
        """

        skipped = 0
        while True:
            c = self._get_char()
            if c != ' ' and c != '\t':
                if (c != '\n') or not self.multiline:
                    self._unget_char(c)
                    return skipped
            skipped += 1

    def get(self, want_leading = False, want_comment = False):
        """Get the next token.

        @param want_leading: If True, return a WHITESPACE token if the
        first character read is whitespace.  The default is False.
        @type want_leading: bool
        @param want_comment: If True, return a COMMENT token if the
        first token read is a comment.  The default is False.
        @type want_comment: bool
        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: input ended prematurely
        @raises dns.exception.SyntaxError: input was badly formed
        """

        if self.ungotten_token is not None:
            token = self.ungotten_token
            self.ungotten_token = None
            # An ungotten whitespace or comment token is only handed back
            # if the caller asked for that kind of token; otherwise it is
            # dropped and the next token is read from the input.
            if token.is_whitespace():
                if want_leading:
                    return token
            elif token.is_comment():
                if want_comment:
                    return token
            else:
                return token
        skipped = self.skip_whitespace()
        if want_leading and skipped > 0:
            return Token(WHITESPACE, ' ')
        token = ''
        ttype = IDENTIFIER
        has_escape = False
        while True:
            c = self._get_char()
            if c == '' or c in self.delimiters:
                if c == '' and self.quoting:
                    raise dns.exception.UnexpectedEnd
                if token == '' and ttype != QUOTED_STRING:
                    if c == '(':
                        self.multiline += 1
                        self.skip_whitespace()
                        continue
                    elif c == ')':
                        if self.multiline <= 0:
                            raise dns.exception.SyntaxError
                        self.multiline -= 1
                        self.skip_whitespace()
                        continue
                    elif c == '"':
                        if not self.quoting:
                            self.quoting = True
                            self.delimiters = _QUOTING_DELIMITERS
                            ttype = QUOTED_STRING
                            continue
                        else:
                            self.quoting = False
                            self.delimiters = _DELIMITERS
                            self.skip_whitespace()
                            continue
                    elif c == '\n':
                        return Token(EOL, '\n')
                    elif c == ';':
                        # Collect the comment text up to, but not
                        # including, the end of the line or input.
                        while True:
                            c = self._get_char()
                            if c == '\n' or c == '':
                                break
                            token += c
                        if want_comment:
                            # Push back the terminator so that the next
                            # call sees the EOL or EOF.
                            self._unget_char(c)
                            return Token(COMMENT, token)
                        elif c == '':
                            if self.multiline:
                                raise dns.exception.SyntaxError('unbalanced parentheses')
                            return Token(EOF)
                        elif self.multiline:
                            self.skip_whitespace()
                            token = ''
                            continue
                        else:
                            return Token(EOL, '\n')
                    else:
                        # This code exists in case we ever want a
                        # delimiter to be returned.  It never produces
                        # a token currently.
                        token = c
                        ttype = DELIMITER
                else:
                    self._unget_char(c)
                break
            elif self.quoting:
                if c == '\\':
                    c = self._get_char()
                    if c == '':
                        raise dns.exception.UnexpectedEnd
                    if c.isdigit():
                        c2 = self._get_char()
                        if c2 == '':
                            raise dns.exception.UnexpectedEnd
                        c3 = self._get_char()
                        if c3 == '':
                            raise dns.exception.UnexpectedEnd
                        if not (c2.isdigit() and c3.isdigit()):
                            raise dns.exception.SyntaxError
                        codepoint = int(c) * 100 + int(c2) * 10 + int(c3)
                        # chr() rejects values above 255 with ValueError;
                        # report it as malformed input instead.
                        if codepoint > 255:
                            raise dns.exception.SyntaxError('escaped decimal value too large')
                        c = chr(codepoint)
                elif c == '\n':
                    raise dns.exception.SyntaxError('newline in quoted string')
            elif c == '\\':
                #
                # It's an escape.  Put it and the next character into
                # the token; it will be checked later for goodness.
                #
                token += c
                has_escape = True
                c = self._get_char()
                if c == '' or c == '\n':
                    raise dns.exception.UnexpectedEnd
            token += c
        if token == '' and ttype != QUOTED_STRING:
            if self.multiline:
                raise dns.exception.SyntaxError('unbalanced parentheses')
            ttype = EOF
        return Token(ttype, token, has_escape)

    def unget(self, token):
        """Unget a token.

        The unget buffer for tokens is only one token large; it is
        an error to try to unget a token when the unget buffer is not
        empty.

        @param token: the token to unget
        @type token: Token object
        @raises UngetBufferFull: there is already an ungotten token
        """

        if self.ungotten_token is not None:
            raise UngetBufferFull
        self.ungotten_token = token

    def next(self):
        """Return the next item in an iteration.

        @rtype: Token object
        @raises StopIteration: the next token is EOF
        """

        token = self.get()
        if token.is_eof():
            raise StopIteration
        return token

    def __iter__(self):
        return self

    # Helpers

    def _get_digits(self):
        """Read the next token and return its value, which must be an
        identifier consisting only of decimal digits.

        @raises dns.exception.SyntaxError:
        @rtype: string
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        if not token.value.isdigit():
            raise dns.exception.SyntaxError('expecting an integer')
        return token.value

    def get_int(self):
        """Read the next token and interpret it as an integer.

        @raises dns.exception.SyntaxError:
        @rtype: int
        """

        return int(self._get_digits())

    def get_uint8(self):
        """Read the next token and interpret it as an 8-bit unsigned
        integer.

        @raises dns.exception.SyntaxError:
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 255:
            raise dns.exception.SyntaxError('%d is not an unsigned 8-bit integer' % value)
        return value

    def get_uint16(self):
        """Read the next token and interpret it as a 16-bit unsigned
        integer.

        @raises dns.exception.SyntaxError:
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 65535:
            raise dns.exception.SyntaxError('%d is not an unsigned 16-bit integer' % value)
        return value

    def get_uint32(self):
        """Read the next token and interpret it as a 32-bit unsigned
        integer.

        @raises dns.exception.SyntaxError:
        @rtype: int
        """

        value = long(self._get_digits())
        # The largest unsigned 32-bit value is 2**32 - 1.
        if value < 0 or value > 4294967295:
            raise dns.exception.SyntaxError('%d is not an unsigned 32-bit integer' % value)
        return value

    def get_string(self, origin=None):
        """Read the next token and interpret it as a string.

        The I{origin} parameter is accepted but not used.

        @raises dns.exception.SyntaxError:
        @rtype: string
        """

        token = self.get().unescape()
        if not (token.is_identifier() or token.is_quoted_string()):
            raise dns.exception.SyntaxError('expecting a string')
        return token.value

    def get_identifier(self, origin=None):
        """Read the next token and raise an exception if it is not an identifier.

        The I{origin} parameter is accepted but not used.

        @raises dns.exception.SyntaxError:
        @rtype: string
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return token.value

    def get_name(self, origin=None):
        """Read the next token and interpret it as a DNS name.

        The token value is passed, still escaped, to dns.name.from_text()
        together with I{origin}.

        @raises dns.exception.SyntaxError:
        @rtype: dns.name.Name object"""

        token = self.get()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.name.from_text(token.value, origin)

    def get_eol(self):
        """Read the next token and raise an exception if it isn't EOL or
        EOF.

        @raises dns.exception.SyntaxError:
        @rtype: string
        """

        token = self.get()
        if not token.is_eol_or_eof():
            raise dns.exception.SyntaxError('expected EOL or EOF, got %d "%s"' % (token.ttype, token.value))
        return token.value

    def get_ttl(self):
        """Read the next token and interpret it as a TTL using
        dns.ttl.from_text().

        @raises dns.exception.SyntaxError: the token is not an identifier
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.ttl.from_text(token.value)