Home | History | Annotate | Download | only in Plex
      1 #=======================================================================
      2 #
      3 #   Python Lexical Analyser
      4 #
      5 #   Traditional Regular Expression Syntax
      6 #
      7 #=======================================================================
      8 
      9 from Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char
     10 from Errors import PlexError
     11 
     12 class RegexpSyntaxError(PlexError):
     13   pass
     14 
     15 def re(s):
     16   """
     17   Convert traditional string representation of regular expression |s|
     18   into Plex representation.
     19   """
     20   return REParser(s).parse_re()
     21 
     22 class REParser(object):
     23 
     24   def __init__(self, s):
     25     self.s = s
     26     self.i = -1
     27     self.end = 0
     28     self.next()
     29 
     30   def parse_re(self):
     31     re = self.parse_alt()
     32     if not self.end:
     33       self.error("Unexpected %s" % repr(self.c))
     34     return re
     35 
     36   def parse_alt(self):
     37     """Parse a set of alternative regexps."""
     38     re = self.parse_seq()
     39     if self.c == '|':
     40       re_list = [re]
     41       while self.c == '|':
     42         self.next()
     43         re_list.append(self.parse_seq())
     44       re = Alt(*re_list)
     45     return re
     46 
     47   def parse_seq(self):
     48     """Parse a sequence of regexps."""
     49     re_list = []
     50     while not self.end and not self.c in "|)":
     51       re_list.append(self.parse_mod())
     52     return Seq(*re_list)
     53 
     54   def parse_mod(self):
     55     """Parse a primitive regexp followed by *, +, ? modifiers."""
     56     re = self.parse_prim()
     57     while not self.end and self.c in "*+?":
     58       if self.c == '*':
     59         re = Rep(re)
     60       elif self.c == '+':
     61         re = Rep1(re)
     62       else: # self.c == '?'
     63         re = Opt(re)
     64       self.next()
     65     return re
     66 
     67   def parse_prim(self):
     68     """Parse a primitive regexp."""
     69     c = self.get()
     70     if c == '.':
     71       re = AnyBut("\n")
     72     elif c == '^':
     73       re = Bol
     74     elif c == '$':
     75       re = Eol
     76     elif c == '(':
     77       re = self.parse_alt()
     78       self.expect(')')
     79     elif c == '[':
     80       re = self.parse_charset()
     81       self.expect(']')
     82     else:
     83       if c == '\\':
     84         c = self.get()
     85       re = Char(c)
     86     return re
     87 
     88   def parse_charset(self):
     89     """Parse a charset. Does not include the surrounding []."""
     90     char_list = []
     91     invert = 0
     92     if self.c == '^':
     93       invert = 1
     94       self.next()
     95     if self.c == ']':
     96       char_list.append(']')
     97       self.next()
     98     while not self.end and self.c != ']':
     99       c1 = self.get()
    100       if self.c == '-' and self.lookahead(1) != ']':
    101         self.next()
    102         c2 = self.get()
    103         for a in xrange(ord(c1), ord(c2) + 1):
    104           char_list.append(chr(a))
    105       else:
    106         char_list.append(c1)
    107     chars = ''.join(char_list)
    108     if invert:
    109       return AnyBut(chars)
    110     else:
    111       return Any(chars)
    112 
    113   def next(self):
    114     """Advance to the next char."""
    115     s = self.s
    116     i = self.i = self.i + 1
    117     if i < len(s):
    118       self.c = s[i]
    119     else:
    120       self.c = ''
    121       self.end = 1
    122 
    123   def get(self):
    124     if self.end:
    125       self.error("Premature end of string")
    126     c = self.c
    127     self.next()
    128     return c
    129 
    130   def lookahead(self, n):
    131     """Look ahead n chars."""
    132     j = self.i + n
    133     if j < len(self.s):
    134       return self.s[j]
    135     else:
    136       return ''
    137 
    138   def expect(self, c):
    139     """
    140     Expect to find character |c| at current position.
    141     Raises an exception otherwise.
    142     """
    143     if self.c == c:
    144       self.next()
    145     else:
    146       self.error("Missing %s" % repr(c))
    147 
    148   def error(self, mess):
    149     """Raise exception to signal syntax error in regexp."""
    150     raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % (
    151       repr(self.s), self.i, mess))
    152 
    153 
    154 
    155