#=======================================================================
#
#   Python Lexical Analyser
#
#   Traditional Regular Expression Syntax
#
#=======================================================================

from Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char
from Errors import PlexError


class RegexpSyntaxError(PlexError):
    """Raised by REParser when a traditional regexp string is malformed."""
    pass


def re(s):
    """
    Convert traditional string representation of regular expression |s|
    into Plex representation.

    Raises RegexpSyntaxError if |s| cannot be parsed.
    """
    return REParser(s).parse_re()


class REParser(object):
    """
    Recursive-descent parser for traditional regular expression syntax.

    Grammar, as implemented by the parse_* methods:

        alt     ::=  seq ('|' seq)*
        seq     ::=  mod*
        mod     ::=  prim ('*' | '+' | '?')*
        prim    ::=  '.' | '^' | '$' | '(' alt ')' | '[' charset ']'
                  |  '\\' char | char

    Parser state:

        s    -- the string being parsed
        i    -- index of the current character within s
        c    -- the current character, or '' once the end is reached
        end  -- true once the parser has moved past the last character
    """

    def __init__(self, s):
        self.s = s
        # Start one position before the string so that the initial next()
        # call lands on index 0 and initialises self.c.
        self.i = -1
        self.end = False
        self.next()

    def parse_re(self):
        """
        Parse the whole string and return the resulting Plex regexp.

        Raises RegexpSyntaxError if input remains after a complete
        alternation has been parsed (e.g. an unmatched ')').
        """
        result = self.parse_alt()
        if not self.end:
            self.error("Unexpected %s" % repr(self.c))
        return result

    def parse_alt(self):
        """Parse a set of alternative regexps."""
        result = self.parse_seq()
        if self.c == '|':
            alternatives = [result]
            while self.c == '|':
                self.next()
                alternatives.append(self.parse_seq())
            result = Alt(*alternatives)
        # A single alternative is returned as-is, without an Alt wrapper.
        return result

    def parse_seq(self):
        """Parse a sequence of regexps."""
        items = []
        # The end check must come first: at the end self.c is '', and
        # '' is "in" every string.
        while not self.end and self.c not in "|)":
            items.append(self.parse_mod())
        return Seq(*items)

    def parse_mod(self):
        """Parse a primitive regexp followed by *, +, ? modifiers."""
        result = self.parse_prim()
        while not self.end and self.c in "*+?":
            if self.c == '*':
                result = Rep(result)
            elif self.c == '+':
                result = Rep1(result)
            else:  # self.c == '?'
                result = Opt(result)
            self.next()
        return result

    def parse_prim(self):
        """
        Parse a primitive regexp.

        Raises RegexpSyntaxError on premature end of input or on a
        missing closing ')' or ']'.
        """
        c = self.get()
        if c == '.':
            result = AnyBut("\n")
        elif c == '^':
            result = Bol
        elif c == '$':
            result = Eol
        elif c == '(':
            result = self.parse_alt()
            self.expect(')')
        elif c == '[':
            result = self.parse_charset()
            self.expect(']')
        else:
            if c == '\\':
                # A backslash makes the following character literal.
                c = self.get()
            result = Char(c)
        return result

    def parse_charset(self):
        """
        Parse a charset. Does not include the surrounding [].

        A leading '^' inverts the set, and a ']' appearing first (after
        the optional '^') is taken as a literal member.  'a-z' denotes an
        inclusive range, except that a '-' immediately before the closing
        ']' is a literal.  A range whose end precedes its start
        contributes no characters.
        """
        char_list = []
        invert = False
        if self.c == '^':
            invert = True
            self.next()
        if self.c == ']':
            char_list.append(']')
            self.next()
        while not self.end and self.c != ']':
            c1 = self.get()
            if self.c == '-' and self.lookahead(1) != ']':
                self.next()
                c2 = self.get()
                # range() rather than xrange() so this also runs on
                # Python 3, where xrange no longer exists.
                for a in range(ord(c1), ord(c2) + 1):
                    char_list.append(chr(a))
            else:
                char_list.append(c1)
        chars = ''.join(char_list)
        if invert:
            return AnyBut(chars)
        else:
            return Any(chars)

    def next(self):
        """Advance to the next char."""
        s = self.s
        i = self.i = self.i + 1
        if i < len(s):
            self.c = s[i]
        else:
            self.c = ''
            self.end = True

    def get(self):
        """
        Return the current char and advance past it.

        Raises RegexpSyntaxError if the end of the string has already
        been reached.
        """
        if self.end:
            self.error("Premature end of string")
        c = self.c
        self.next()
        return c

    def lookahead(self, n):
        """
        Look ahead n chars.

        Returns the char n positions after the current one, or '' if
        that position lies beyond the end of the string.
        """
        j = self.i + n
        if j < len(self.s):
            return self.s[j]
        else:
            return ''

    def expect(self, c):
        """
        Expect to find character |c| at current position.
        Raises an exception otherwise.
        """
        if self.c == c:
            self.next()
        else:
            self.error("Missing %s" % repr(c))

    def error(self, mess):
        """Raise exception to signal syntax error in regexp."""
        raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % (
            repr(self.s), self.i, mess))