Home | History | Annotate | Download | only in Lib
      1 # -*- coding: iso-8859-1 -*-

      2 """A lexical analyzer class for simple shell-like syntaxes."""
      3 
      4 # Module and documentation by Eric S. Raymond, 21 Dec 1998

      5 # Input stacking and error message cleanup added by ESR, March 2000

      6 # push_source() and pop_source() made explicit by ESR, January 2001.

      7 # Posix compliance, split(), string arguments, and

      8 # iterator interface by Gustavo Niemeyer, April 2003.

      9 
import os.path
import sys
from collections import deque

try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:
        # Neither legacy module is available; fall back to the io module.
        from io import StringIO
     18 
     19 __all__ = ["shlex", "split"]
     20 
     21 class shlex:
     22     "A lexical analyzer class for simple shell-like syntaxes."
     23     def __init__(self, instream=None, infile=None, posix=False):
     24         if isinstance(instream, basestring):
     25             instream = StringIO(instream)
     26         if instream is not None:
     27             self.instream = instream
     28             self.infile = infile
     29         else:
     30             self.instream = sys.stdin
     31             self.infile = None
     32         self.posix = posix
     33         if posix:
     34             self.eof = None
     35         else:
     36             self.eof = ''
     37         self.commenters = '#'
     38         self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
     39                           'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
     40         if self.posix:
     41             self.wordchars += (''
     42                                '')
     43         self.whitespace = ' \t\r\n'
     44         self.whitespace_split = False
     45         self.quotes = '\'"'
     46         self.escape = '\\'
     47         self.escapedquotes = '"'
     48         self.state = ' '
     49         self.pushback = deque()
     50         self.lineno = 1
     51         self.debug = 0
     52         self.token = ''
     53         self.filestack = deque()
     54         self.source = None
     55         if self.debug:
     56             print 'shlex: reading from %s, line %d' \
     57                   % (self.instream, self.lineno)
     58 
     59     def push_token(self, tok):
     60         "Push a token onto the stack popped by the get_token method"
     61         if self.debug >= 1:
     62             print "shlex: pushing token " + repr(tok)
     63         self.pushback.appendleft(tok)
     64 
     65     def push_source(self, newstream, newfile=None):
     66         "Push an input source onto the lexer's input source stack."
     67         if isinstance(newstream, basestring):
     68             newstream = StringIO(newstream)
     69         self.filestack.appendleft((self.infile, self.instream, self.lineno))
     70         self.infile = newfile
     71         self.instream = newstream
     72         self.lineno = 1
     73         if self.debug:
     74             if newfile is not None:
     75                 print 'shlex: pushing to file %s' % (self.infile,)
     76             else:
     77                 print 'shlex: pushing to stream %s' % (self.instream,)
     78 
     79     def pop_source(self):
     80         "Pop the input source stack."
     81         self.instream.close()
     82         (self.infile, self.instream, self.lineno) = self.filestack.popleft()
     83         if self.debug:
     84             print 'shlex: popping to %s, line %d' \
     85                   % (self.instream, self.lineno)
     86         self.state = ' '
     87 
     88     def get_token(self):
     89         "Get a token from the input stream (or from stack if it's nonempty)"
     90         if self.pushback:
     91             tok = self.pushback.popleft()
     92             if self.debug >= 1:
     93                 print "shlex: popping token " + repr(tok)
     94             return tok
     95         # No pushback.  Get a token.
     96         raw = self.read_token()
     97         # Handle inclusions
     98         if self.source is not None:
     99             while raw == self.source:
    100                 spec = self.sourcehook(self.read_token())
    101                 if spec:
    102                     (newfile, newstream) = spec
    103                     self.push_source(newstream, newfile)
    104                 raw = self.get_token()
    105         # Maybe we got EOF instead?
    106         while raw == self.eof:
    107             if not self.filestack:
    108                 return self.eof
    109             else:
    110                 self.pop_source()
    111                 raw = self.get_token()
    112         # Neither inclusion nor EOF
    113         if self.debug >= 1:
    114             if raw != self.eof:
    115                 print "shlex: token=" + repr(raw)
    116             else:
    117                 print "shlex: token=EOF"
    118         return raw
    119 
    120     def read_token(self):
    121         quoted = False
    122         escapedstate = ' '
    123         while True:
    124             nextchar = self.instream.read(1)
    125             if nextchar == '\n':
    126                 self.lineno = self.lineno + 1
    127             if self.debug >= 3:
    128                 print "shlex: in state", repr(self.state), \
    129                       "I see character:", repr(nextchar)
    130             if self.state is None:
    131                 self.token = ''        # past end of file
    132                 break
    133             elif self.state == ' ':
    134                 if not nextchar:
    135                     self.state = None  # end of file
    136                     break
    137                 elif nextchar in self.whitespace:
    138                     if self.debug >= 2:
    139                         print "shlex: I see whitespace in whitespace state"
    140                     if self.token or (self.posix and quoted):
    141                         break   # emit current token
    142                     else:
    143                         continue
    144                 elif nextchar in self.commenters:
    145                     self.instream.readline()
    146                     self.lineno = self.lineno + 1
    147                 elif self.posix and nextchar in self.escape:
    148                     escapedstate = 'a'
    149                     self.state = nextchar
    150                 elif nextchar in self.wordchars:
    151                     self.token = nextchar
    152                     self.state = 'a'
    153                 elif nextchar in self.quotes:
    154                     if not self.posix:
    155                         self.token = nextchar
    156                     self.state = nextchar
    157                 elif self.whitespace_split:
    158                     self.token = nextchar
    159                     self.state = 'a'
    160                 else:
    161                     self.token = nextchar
    162                     if self.token or (self.posix and quoted):
    163                         break   # emit current token
    164                     else:
    165                         continue
    166             elif self.state in self.quotes:
    167                 quoted = True
    168                 if not nextchar:      # end of file
    169                     if self.debug >= 2:
    170                         print "shlex: I see EOF in quotes state"
    171                     # XXX what error should be raised here?
    172                     raise ValueError, "No closing quotation"
    173                 if nextchar == self.state:
    174                     if not self.posix:
    175                         self.token = self.token + nextchar
    176                         self.state = ' '
    177                         break
    178                     else:
    179                         self.state = 'a'
    180                 elif self.posix and nextchar in self.escape and \
    181                      self.state in self.escapedquotes:
    182                     escapedstate = self.state
    183                     self.state = nextchar
    184                 else:
    185                     self.token = self.token + nextchar
    186             elif self.state in self.escape:
    187                 if not nextchar:      # end of file
    188                     if self.debug >= 2:
    189                         print "shlex: I see EOF in escape state"
    190                     # XXX what error should be raised here?
    191                     raise ValueError, "No escaped character"
    192                 # In posix shells, only the quote itself or the escape
    193                 # character may be escaped within quotes.
    194                 if escapedstate in self.quotes and \
    195                    nextchar != self.state and nextchar != escapedstate:
    196                     self.token = self.token + self.state
    197                 self.token = self.token + nextchar
    198                 self.state = escapedstate
    199             elif self.state == 'a':
    200                 if not nextchar:
    201                     self.state = None   # end of file
    202                     break
    203                 elif nextchar in self.whitespace:
    204                     if self.debug >= 2:
    205                         print "shlex: I see whitespace in word state"
    206                     self.state = ' '
    207                     if self.token or (self.posix and quoted):
    208                         break   # emit current token
    209                     else:
    210                         continue
    211                 elif nextchar in self.commenters:
    212                     self.instream.readline()
    213                     self.lineno = self.lineno + 1
    214                     if self.posix:
    215                         self.state = ' '
    216                         if self.token or (self.posix and quoted):
    217                             break   # emit current token
    218                         else:
    219                             continue
    220                 elif self.posix and nextchar in self.quotes:
    221                     self.state = nextchar
    222                 elif self.posix and nextchar in self.escape:
    223                     escapedstate = 'a'
    224                     self.state = nextchar
    225                 elif nextchar in self.wordchars or nextchar in self.quotes \
    226                     or self.whitespace_split:
    227                     self.token = self.token + nextchar
    228                 else:
    229                     self.pushback.appendleft(nextchar)
    230                     if self.debug >= 2:
    231                         print "shlex: I see punctuation in word state"
    232                     self.state = ' '
    233                     if self.token:
    234                         break   # emit current token
    235                     else:
    236                         continue
    237         result = self.token
    238         self.token = ''
    239         if self.posix and not quoted and result == '':
    240             result = None
    241         if self.debug > 1:
    242             if result:
    243                 print "shlex: raw token=" + repr(result)
    244             else:
    245                 print "shlex: raw token=EOF"
    246         return result
    247 
    248     def sourcehook(self, newfile):
    249         "Hook called on a filename to be sourced."
    250         if newfile[0] == '"':
    251             newfile = newfile[1:-1]
    252         # This implements cpp-like semantics for relative-path inclusion.
    253         if isinstance(self.infile, basestring) and not os.path.isabs(newfile):
    254             newfile = os.path.join(os.path.dirname(self.infile), newfile)
    255         return (newfile, open(newfile, "r"))
    256 
    257     def error_leader(self, infile=None, lineno=None):
    258         "Emit a C-compiler-like, Emacs-friendly error-message leader."
    259         if infile is None:
    260             infile = self.infile
    261         if lineno is None:
    262             lineno = self.lineno
    263         return "\"%s\", line %d: " % (infile, lineno)
    264 
    265     def __iter__(self):
    266         return self
    267 
    268     def next(self):
    269         token = self.get_token()
    270         if token == self.eof:
    271             raise StopIteration
    272         return token
    273 
    274 def split(s, comments=False, posix=True):
    275     lex = shlex(s, posix=posix)
    276     lex.whitespace_split = True
    277     if not comments:
    278         lex.commenters = ''
    279     return list(lex)
    280 
    281 if __name__ == '__main__':
    282     if len(sys.argv) == 1:
    283         lexer = shlex()
    284     else:
    285         file = sys.argv[1]
    286         lexer = shlex(open(file), file)
    287     while 1:
    288         tt = lexer.get_token()
    289         if tt:
    290             print "Token: " + repr(tt)
    291         else:
    292             break
    293