Home | History | Annotate | Download | only in dns
      1 # Copyright (C) 2003-2007, 2009, 2010 Nominum, Inc.
      2 #
      3 # Permission to use, copy, modify, and distribute this software and its
      4 # documentation for any purpose with or without fee is hereby granted,
      5 # provided that the above copyright notice and this permission notice
      6 # appear in all copies.
      7 #
      8 # THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES
      9 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     10 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR
     11 # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     12 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     13 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
     14 # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     15 
     16 """Tokenize DNS master file format"""
     17 
     18 import cStringIO
     19 import sys
     20 
     21 import dns.exception
     22 import dns.name
     23 import dns.ttl
     24 
     25 _DELIMITERS = {
     26     ' ' : True,
     27     '\t' : True,
     28     '\n' : True,
     29     ';' : True,
     30     '(' : True,
     31     ')' : True,
     32     '"' : True }
     33 
     34 _QUOTING_DELIMITERS = { '"' : True }
     35 
     36 EOF = 0
     37 EOL = 1
     38 WHITESPACE = 2
     39 IDENTIFIER = 3
     40 QUOTED_STRING = 4
     41 COMMENT = 5
     42 DELIMITER = 6
     43 
     44 class UngetBufferFull(dns.exception.DNSException):
     45     """Raised when an attempt is made to unget a token when the unget
     46     buffer is full."""
     47     pass
     48 
     49 class Token(object):
     50     """A DNS master file format token.
     51 
     52     @ivar ttype: The token type
     53     @type ttype: int
     54     @ivar value: The token value
     55     @type value: string
     56     @ivar has_escape: Does the token value contain escapes?
     57     @type has_escape: bool
     58     """
     59 
     60     def __init__(self, ttype, value='', has_escape=False):
     61         """Initialize a token instance.
     62 
     63         @param ttype: The token type
     64         @type ttype: int
     65         @ivar value: The token value
     66         @type value: string
     67         @ivar has_escape: Does the token value contain escapes?
     68         @type has_escape: bool
     69         """
     70         self.ttype = ttype
     71         self.value = value
     72         self.has_escape = has_escape
     73 
     74     def is_eof(self):
     75         return self.ttype == EOF
     76 
     77     def is_eol(self):
     78         return self.ttype == EOL
     79 
     80     def is_whitespace(self):
     81         return self.ttype == WHITESPACE
     82 
     83     def is_identifier(self):
     84         return self.ttype == IDENTIFIER
     85 
     86     def is_quoted_string(self):
     87         return self.ttype == QUOTED_STRING
     88 
     89     def is_comment(self):
     90         return self.ttype == COMMENT
     91 
     92     def is_delimiter(self):
     93         return self.ttype == DELIMITER
     94 
     95     def is_eol_or_eof(self):
     96         return (self.ttype == EOL or self.ttype == EOF)
     97 
     98     def __eq__(self, other):
     99         if not isinstance(other, Token):
    100             return False
    101         return (self.ttype == other.ttype and
    102                 self.value == other.value)
    103 
    104     def __ne__(self, other):
    105         if not isinstance(other, Token):
    106             return True
    107         return (self.ttype != other.ttype or
    108                 self.value != other.value)
    109 
    110     def __str__(self):
    111         return '%d "%s"' % (self.ttype, self.value)
    112 
    113     def unescape(self):
    114         if not self.has_escape:
    115             return self
    116         unescaped = ''
    117         l = len(self.value)
    118         i = 0
    119         while i < l:
    120             c = self.value[i]
    121             i += 1
    122             if c == '\\':
    123                 if i >= l:
    124                     raise dns.exception.UnexpectedEnd
    125                 c = self.value[i]
    126                 i += 1
    127                 if c.isdigit():
    128                     if i >= l:
    129                         raise dns.exception.UnexpectedEnd
    130                     c2 = self.value[i]
    131                     i += 1
    132                     if i >= l:
    133                         raise dns.exception.UnexpectedEnd
    134                     c3 = self.value[i]
    135                     i += 1
    136                     if not (c2.isdigit() and c3.isdigit()):
    137                         raise dns.exception.SyntaxError
    138                     c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
    139             unescaped += c
    140         return Token(self.ttype, unescaped)
    141 
    142     # compatibility for old-style tuple tokens
    143 
    144     def __len__(self):
    145         return 2
    146 
    147     def __iter__(self):
    148         return iter((self.ttype, self.value))
    149 
    150     def __getitem__(self, i):
    151         if i == 0:
    152             return self.ttype
    153         elif i == 1:
    154             return self.value
    155         else:
    156             raise IndexError
    157 
    158 class Tokenizer(object):
    159     """A DNS master file format tokenizer.
    160 
    161     A token is a (type, value) tuple, where I{type} is an int, and
    162     I{value} is a string.  The valid types are EOF, EOL, WHITESPACE,
    163     IDENTIFIER, QUOTED_STRING, COMMENT, and DELIMITER.
    164 
    165     @ivar file: The file to tokenize
    166     @type file: file
    167     @ivar ungotten_char: The most recently ungotten character, or None.
    168     @type ungotten_char: string
    169     @ivar ungotten_token: The most recently ungotten token, or None.
    170     @type ungotten_token: (int, string) token tuple
    171     @ivar multiline: The current multiline level.  This value is increased
    172     by one every time a '(' delimiter is read, and decreased by one every time
    173     a ')' delimiter is read.
    174     @type multiline: int
    175     @ivar quoting: This variable is true if the tokenizer is currently
    176     reading a quoted string.
    177     @type quoting: bool
    178     @ivar eof: This variable is true if the tokenizer has encountered EOF.
    179     @type eof: bool
    180     @ivar delimiters: The current delimiter dictionary.
    181     @type delimiters: dict
    182     @ivar line_number: The current line number
    183     @type line_number: int
    184     @ivar filename: A filename that will be returned by the L{where} method.
    185     @type filename: string
    186     """
    187 
    188     def __init__(self, f=sys.stdin, filename=None):
    189         """Initialize a tokenizer instance.
    190 
    191         @param f: The file to tokenize.  The default is sys.stdin.
    192         This parameter may also be a string, in which case the tokenizer
    193         will take its input from the contents of the string.
    194         @type f: file or string
    195         @param filename: the name of the filename that the L{where} method
    196         will return.
    197         @type filename: string
    198         """
    199 
    200         if isinstance(f, str):
    201             f = cStringIO.StringIO(f)
    202             if filename is None:
    203                 filename = '<string>'
    204         else:
    205             if filename is None:
    206                 if f is sys.stdin:
    207                     filename = '<stdin>'
    208                 else:
    209                     filename = '<file>'
    210         self.file = f
    211         self.ungotten_char = None
    212         self.ungotten_token = None
    213         self.multiline = 0
    214         self.quoting = False
    215         self.eof = False
    216         self.delimiters = _DELIMITERS
    217         self.line_number = 1
    218         self.filename = filename
    219 
    220     def _get_char(self):
    221         """Read a character from input.
    222         @rtype: string
    223         """
    224 
    225         if self.ungotten_char is None:
    226             if self.eof:
    227                 c = ''
    228             else:
    229                 c = self.file.read(1)
    230                 if c == '':
    231                     self.eof = True
    232                 elif c == '\n':
    233                     self.line_number += 1
    234         else:
    235             c = self.ungotten_char
    236             self.ungotten_char = None
    237         return c
    238 
    239     def where(self):
    240         """Return the current location in the input.
    241 
    242         @rtype: (string, int) tuple.  The first item is the filename of
    243         the input, the second is the current line number.
    244         """
    245 
    246         return (self.filename, self.line_number)
    247 
    248     def _unget_char(self, c):
    249         """Unget a character.
    250 
    251         The unget buffer for characters is only one character large; it is
    252         an error to try to unget a character when the unget buffer is not
    253         empty.
    254 
    255         @param c: the character to unget
    256         @type c: string
    257         @raises UngetBufferFull: there is already an ungotten char
    258         """
    259 
    260         if not self.ungotten_char is None:
    261             raise UngetBufferFull
    262         self.ungotten_char = c
    263 
    264     def skip_whitespace(self):
    265         """Consume input until a non-whitespace character is encountered.
    266 
    267         The non-whitespace character is then ungotten, and the number of
    268         whitespace characters consumed is returned.
    269 
    270         If the tokenizer is in multiline mode, then newlines are whitespace.
    271 
    272         @rtype: int
    273         """
    274 
    275         skipped = 0
    276         while True:
    277             c = self._get_char()
    278             if c != ' ' and c != '\t':
    279                 if (c != '\n') or not self.multiline:
    280                     self._unget_char(c)
    281                     return skipped
    282             skipped += 1
    283 
    284     def get(self, want_leading = False, want_comment = False):
    285         """Get the next token.
    286 
    287         @param want_leading: If True, return a WHITESPACE token if the
    288         first character read is whitespace.  The default is False.
    289         @type want_leading: bool
    290         @param want_comment: If True, return a COMMENT token if the
    291         first token read is a comment.  The default is False.
    292         @type want_comment: bool
    293         @rtype: Token object
    294         @raises dns.exception.UnexpectedEnd: input ended prematurely
    295         @raises dns.exception.SyntaxError: input was badly formed
    296         """
    297 
    298         if not self.ungotten_token is None:
    299             token = self.ungotten_token
    300             self.ungotten_token = None
    301             if token.is_whitespace():
    302                 if want_leading:
    303                     return token
    304             elif token.is_comment():
    305                 if want_comment:
    306                     return token
    307             else:
    308                 return token
    309         skipped = self.skip_whitespace()
    310         if want_leading and skipped > 0:
    311             return Token(WHITESPACE, ' ')
    312         token = ''
    313         ttype = IDENTIFIER
    314         has_escape = False
    315         while True:
    316             c = self._get_char()
    317             if c == '' or c in self.delimiters:
    318                 if c == '' and self.quoting:
    319                     raise dns.exception.UnexpectedEnd
    320                 if token == '' and ttype != QUOTED_STRING:
    321                     if c == '(':
    322                         self.multiline += 1
    323                         self.skip_whitespace()
    324                         continue
    325                     elif c == ')':
    326                         if not self.multiline > 0:
    327                             raise dns.exception.SyntaxError
    328                         self.multiline -= 1
    329                         self.skip_whitespace()
    330                         continue
    331                     elif c == '"':
    332                         if not self.quoting:
    333                             self.quoting = True
    334                             self.delimiters = _QUOTING_DELIMITERS
    335                             ttype = QUOTED_STRING
    336                             continue
    337                         else:
    338                             self.quoting = False
    339                             self.delimiters = _DELIMITERS
    340                             self.skip_whitespace()
    341                             continue
    342                     elif c == '\n':
    343                         return Token(EOL, '\n')
    344                     elif c == ';':
    345                         while 1:
    346                             c = self._get_char()
    347                             if c == '\n' or c == '':
    348                                 break
    349                             token += c
    350                         if want_comment:
    351                             self._unget_char(c)
    352                             return Token(COMMENT, token)
    353                         elif c == '':
    354                             if self.multiline:
    355                                 raise dns.exception.SyntaxError('unbalanced parentheses')
    356                             return Token(EOF)
    357                         elif self.multiline:
    358                             self.skip_whitespace()
    359                             token = ''
    360                             continue
    361                         else:
    362                             return Token(EOL, '\n')
    363                     else:
    364                         # This code exists in case we ever want a
    365                         # delimiter to be returned.  It never produces
    366                         # a token currently.
    367                         token = c
    368                         ttype = DELIMITER
    369                 else:
    370                     self._unget_char(c)
    371                 break
    372             elif self.quoting:
    373                 if c == '\\':
    374                     c = self._get_char()
    375                     if c == '':
    376                         raise dns.exception.UnexpectedEnd
    377                     if c.isdigit():
    378                         c2 = self._get_char()
    379                         if c2 == '':
    380                             raise dns.exception.UnexpectedEnd
    381                         c3 = self._get_char()
    382                         if c == '':
    383                             raise dns.exception.UnexpectedEnd
    384                         if not (c2.isdigit() and c3.isdigit()):
    385                             raise dns.exception.SyntaxError
    386                         c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
    387                 elif c == '\n':
    388                     raise dns.exception.SyntaxError('newline in quoted string')
    389             elif c == '\\':
    390                 #
    391                 # It's an escape.  Put it and the next character into
    392                 # the token; it will be checked later for goodness.
    393                 #
    394                 token += c
    395                 has_escape = True
    396                 c = self._get_char()
    397                 if c == '' or c == '\n':
    398                     raise dns.exception.UnexpectedEnd
    399             token += c
    400         if token == '' and ttype != QUOTED_STRING:
    401             if self.multiline:
    402                 raise dns.exception.SyntaxError('unbalanced parentheses')
    403             ttype = EOF
    404         return Token(ttype, token, has_escape)
    405 
    406     def unget(self, token):
    407         """Unget a token.
    408 
    409         The unget buffer for tokens is only one token large; it is
    410         an error to try to unget a token when the unget buffer is not
    411         empty.
    412 
    413         @param token: the token to unget
    414         @type token: Token object
    415         @raises UngetBufferFull: there is already an ungotten token
    416         """
    417 
    418         if not self.ungotten_token is None:
    419             raise UngetBufferFull
    420         self.ungotten_token = token
    421 
    422     def next(self):
    423         """Return the next item in an iteration.
    424         @rtype: (int, string)
    425         """
    426 
    427         token = self.get()
    428         if token.is_eof():
    429             raise StopIteration
    430         return token
    431 
    432     def __iter__(self):
    433         return self
    434 
    435     # Helpers
    436 
    437     def get_int(self):
    438         """Read the next token and interpret it as an integer.
    439 
    440         @raises dns.exception.SyntaxError:
    441         @rtype: int
    442         """
    443 
    444         token = self.get().unescape()
    445         if not token.is_identifier():
    446             raise dns.exception.SyntaxError('expecting an identifier')
    447         if not token.value.isdigit():
    448             raise dns.exception.SyntaxError('expecting an integer')
    449         return int(token.value)
    450 
    451     def get_uint8(self):
    452         """Read the next token and interpret it as an 8-bit unsigned
    453         integer.
    454 
    455         @raises dns.exception.SyntaxError:
    456         @rtype: int
    457         """
    458 
    459         value = self.get_int()
    460         if value < 0 or value > 255:
    461             raise dns.exception.SyntaxError('%d is not an unsigned 8-bit integer' % value)
    462         return value
    463 
    464     def get_uint16(self):
    465         """Read the next token and interpret it as a 16-bit unsigned
    466         integer.
    467 
    468         @raises dns.exception.SyntaxError:
    469         @rtype: int
    470         """
    471 
    472         value = self.get_int()
    473         if value < 0 or value > 65535:
    474             raise dns.exception.SyntaxError('%d is not an unsigned 16-bit integer' % value)
    475         return value
    476 
    477     def get_uint32(self):
    478         """Read the next token and interpret it as a 32-bit unsigned
    479         integer.
    480 
    481         @raises dns.exception.SyntaxError:
    482         @rtype: int
    483         """
    484 
    485         token = self.get().unescape()
    486         if not token.is_identifier():
    487             raise dns.exception.SyntaxError('expecting an identifier')
    488         if not token.value.isdigit():
    489             raise dns.exception.SyntaxError('expecting an integer')
    490         value = long(token.value)
    491         if value < 0 or value > 4294967296L:
    492             raise dns.exception.SyntaxError('%d is not an unsigned 32-bit integer' % value)
    493         return value
    494 
    495     def get_string(self, origin=None):
    496         """Read the next token and interpret it as a string.
    497 
    498         @raises dns.exception.SyntaxError:
    499         @rtype: string
    500         """
    501 
    502         token = self.get().unescape()
    503         if not (token.is_identifier() or token.is_quoted_string()):
    504             raise dns.exception.SyntaxError('expecting a string')
    505         return token.value
    506 
    507     def get_identifier(self, origin=None):
    508         """Read the next token and raise an exception if it is not an identifier.
    509 
    510         @raises dns.exception.SyntaxError:
    511         @rtype: string
    512         """
    513 
    514         token = self.get().unescape()
    515         if not token.is_identifier():
    516             raise dns.exception.SyntaxError('expecting an identifier')
    517         return token.value
    518 
    519     def get_name(self, origin=None):
    520         """Read the next token and interpret it as a DNS name.
    521 
    522         @raises dns.exception.SyntaxError:
    523         @rtype: dns.name.Name object"""
    524 
    525         token = self.get()
    526         if not token.is_identifier():
    527             raise dns.exception.SyntaxError('expecting an identifier')
    528         return dns.name.from_text(token.value, origin)
    529 
    530     def get_eol(self):
    531         """Read the next token and raise an exception if it isn't EOL or
    532         EOF.
    533 
    534         @raises dns.exception.SyntaxError:
    535         @rtype: string
    536         """
    537 
    538         token = self.get()
    539         if not token.is_eol_or_eof():
    540             raise dns.exception.SyntaxError('expected EOL or EOF, got %d "%s"' % (token.ttype, token.value))
    541         return token.value
    542 
    543     def get_ttl(self):
    544         token = self.get().unescape()
    545         if not token.is_identifier():
    546             raise dns.exception.SyntaxError('expecting an identifier')
    547         return dns.ttl.from_text(token.value)
    548