Home | History | Annotate | Download | only in Plex
      1 #=======================================================================
      2 #
      3 #     Python Lexical Analyser
      4 #
      5 #     Regular Expressions
      6 #
      7 #=======================================================================
      8 
      9 import types
     10 from sys import maxint as maxint
     11 
     12 import Errors
     13 
     14 #
     15 #     Constants
     16 #
     17 
     18 BOL = 'bol'
     19 EOL = 'eol'
     20 EOF = 'eof'
     21 
     22 nl_code = ord('\n')
     23 
     24 #
     25 #     Helper functions
     26 #
     27 
     28 def chars_to_ranges(s):
     29     """
     30     Return a list of character codes consisting of pairs
     31     [code1a, code1b, code2a, code2b,...] which cover all
     32     the characters in |s|.
     33     """
     34     char_list = list(s)
     35     char_list.sort()
     36     i = 0
     37     n = len(char_list)
     38     result = []
     39     while i < n:
     40         code1 = ord(char_list[i])
     41         code2 = code1 + 1
     42         i = i + 1
     43         while i < n and code2 >= ord(char_list[i]):
     44             code2 = code2 + 1
     45             i = i + 1
     46         result.append(code1)
     47         result.append(code2)
     48     return result
     49 
     50 def uppercase_range(code1, code2):
     51     """
     52     If the range of characters from code1 to code2-1 includes any
     53     lower case letters, return the corresponding upper case range.
     54     """
     55     code3 = max(code1, ord('a'))
     56     code4 = min(code2, ord('z') + 1)
     57     if code3 < code4:
     58         d = ord('A') - ord('a')
     59         return (code3 + d, code4 + d)
     60     else:
     61         return None
     62 
     63 def lowercase_range(code1, code2):
     64     """
     65     If the range of characters from code1 to code2-1 includes any
     66     upper case letters, return the corresponding lower case range.
     67     """
     68     code3 = max(code1, ord('A'))
     69     code4 = min(code2, ord('Z') + 1)
     70     if code3 < code4:
     71         d = ord('a') - ord('A')
     72         return (code3 + d, code4 + d)
     73     else:
     74         return None
     75 
     76 def CodeRanges(code_list):
     77     """
     78     Given a list of codes as returned by chars_to_ranges, return
     79     an RE which will match a character in any of the ranges.
     80     """
     81     re_list = []
     82     for i in xrange(0, len(code_list), 2):
     83         re_list.append(CodeRange(code_list[i], code_list[i + 1]))
     84     return Alt(*re_list)
     85 
     86 def CodeRange(code1, code2):
     87     """
     88     CodeRange(code1, code2) is an RE which matches any character
     89     with a code |c| in the range |code1| <= |c| < |code2|.
     90     """
     91     if code1 <= nl_code < code2:
     92         return Alt(RawCodeRange(code1, nl_code),
     93                              RawNewline,
     94                              RawCodeRange(nl_code + 1, code2))
     95     else:
     96         return RawCodeRange(code1, code2)
     97 
     98 #
     99 #     Abstract classes
    100 #
    101 
    102 class RE(object):
    103     """RE is the base class for regular expression constructors.
    104     The following operators are defined on REs:
    105 
    106          re1 + re2         is an RE which matches |re1| followed by |re2|
    107          re1 | re2         is an RE which matches either |re1| or |re2|
    108     """
    109 
    110     nullable = 1 # True if this RE can match 0 input symbols
    111     match_nl = 1 # True if this RE can match a string ending with '\n'
    112     str = None     # Set to a string to override the class's __str__ result
    113 
    114     def build_machine(self, machine, initial_state, final_state,
    115                                         match_bol, nocase):
    116         """
    117         This method should add states to |machine| to implement this
    118         RE, starting at |initial_state| and ending at |final_state|.
    119         If |match_bol| is true, the RE must be able to match at the
    120         beginning of a line. If nocase is true, upper and lower case
    121         letters should be treated as equivalent.
    122         """
    123         raise NotImplementedError("%s.build_machine not implemented" %
    124             self.__class__.__name__)
    125 
    126     def build_opt(self, m, initial_state, c):
    127         """
    128         Given a state |s| of machine |m|, return a new state
    129         reachable from |s| on character |c| or epsilon.
    130         """
    131         s = m.new_state()
    132         initial_state.link_to(s)
    133         initial_state.add_transition(c, s)
    134         return s
    135 
    136     def __add__(self, other):
    137         return Seq(self, other)
    138 
    139     def __or__(self, other):
    140         return Alt(self, other)
    141 
    142     def __str__(self):
    143         if self.str:
    144             return self.str
    145         else:
    146             return self.calc_str()
    147 
    148     def check_re(self, num, value):
    149         if not isinstance(value, RE):
    150             self.wrong_type(num, value, "Plex.RE instance")
    151 
    152     def check_string(self, num, value):
    153         if type(value) != type(''):
    154             self.wrong_type(num, value, "string")
    155 
    156     def check_char(self, num, value):
    157         self.check_string(num, value)
    158         if len(value) != 1:
    159             raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s."
    160                 "Expected a string of length 1, got: %s" % (
    161                     num, self.__class__.__name__, repr(value)))
    162 
    163     def wrong_type(self, num, value, expected):
    164         if type(value) == types.InstanceType:
    165                 got = "%s.%s instance" % (
    166                     value.__class__.__module__, value.__class__.__name__)
    167         else:
    168             got = type(value).__name__
    169         raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s "
    170                                         "(expected %s, got %s" % (
    171                                             num, self.__class__.__name__, expected, got))
    172 
    173 #
    174 #     Primitive RE constructors
    175 #     -------------------------
    176 #
    177 #     These are the basic REs from which all others are built.
    178 #
    179 
    180 ## class Char(RE):
    181 ##     """
    182 ##     Char(c) is an RE which matches the character |c|.
    183 ##     """
    184 
    185 ##     nullable = 0
    186 
    187 ##     def __init__(self, char):
    188 ##         self.char = char
    189 ##         self.match_nl = char == '\n'
    190 
    191 ##     def build_machine(self, m, initial_state, final_state, match_bol, nocase):
    192 ##         c = self.char
    193 ##         if match_bol and c != BOL:
    194 ##             s1 = self.build_opt(m, initial_state, BOL)
    195 ##         else:
    196 ##             s1 = initial_state
    197 ##         if c == '\n' or c == EOF:
    198 ##             s1 = self.build_opt(m, s1, EOL)
    199 ##         if len(c) == 1:
    200 ##             code = ord(self.char)
    201 ##             s1.add_transition((code, code+1), final_state)
    202 ##             if nocase and is_letter_code(code):
    203 ##                 code2 = other_case_code(code)
    204 ##                 s1.add_transition((code2, code2+1), final_state)
    205 ##         else:
    206 ##             s1.add_transition(c, final_state)
    207 
    208 ##     def calc_str(self):
    209 ##         return "Char(%s)" % repr(self.char)
    210 
    211 def Char(c):
    212     """
    213     Char(c) is an RE which matches the character |c|.
    214     """
    215     if len(c) == 1:
    216         result = CodeRange(ord(c), ord(c) + 1)
    217     else:
    218         result = SpecialSymbol(c)
    219     result.str = "Char(%s)" % repr(c)
    220     return result
    221 
    222 class RawCodeRange(RE):
    223     """
    224     RawCodeRange(code1, code2) is a low-level RE which matches any character
    225     with a code |c| in the range |code1| <= |c| < |code2|, where the range
    226     does not include newline. For internal use only.
    227     """
    228     nullable = 0
    229     match_nl = 0
    230     range = None                     # (code, code)
    231     uppercase_range = None # (code, code) or None
    232     lowercase_range = None # (code, code) or None
    233 
    234     def __init__(self, code1, code2):
    235         self.range = (code1, code2)
    236         self.uppercase_range = uppercase_range(code1, code2)
    237         self.lowercase_range = lowercase_range(code1, code2)
    238 
    239     def build_machine(self, m, initial_state, final_state, match_bol, nocase):
    240         if match_bol:
    241             initial_state = self.build_opt(m, initial_state, BOL)
    242         initial_state.add_transition(self.range, final_state)
    243         if nocase:
    244             if self.uppercase_range:
    245                 initial_state.add_transition(self.uppercase_range, final_state)
    246             if self.lowercase_range:
    247                 initial_state.add_transition(self.lowercase_range, final_state)
    248 
    249     def calc_str(self):
    250         return "CodeRange(%d,%d)" % (self.code1, self.code2)
    251 
    252 class _RawNewline(RE):
    253     """
    254     RawNewline is a low-level RE which matches a newline character.
    255     For internal use only.
    256     """
    257     nullable = 0
    258     match_nl = 1
    259 
    260     def build_machine(self, m, initial_state, final_state, match_bol, nocase):
    261         if match_bol:
    262             initial_state = self.build_opt(m, initial_state, BOL)
    263         s = self.build_opt(m, initial_state, EOL)
    264         s.add_transition((nl_code, nl_code + 1), final_state)
    265 
    266 RawNewline = _RawNewline()
    267 
    268 
    269 class SpecialSymbol(RE):
    270     """
    271     SpecialSymbol(sym) is an RE which matches the special input
    272     symbol |sym|, which is one of BOL, EOL or EOF.
    273     """
    274     nullable = 0
    275     match_nl = 0
    276     sym = None
    277 
    278     def __init__(self, sym):
    279         self.sym = sym
    280 
    281     def build_machine(self, m, initial_state, final_state, match_bol, nocase):
    282         # Sequences 'bol bol' and 'bol eof' are impossible, so only need
    283         # to allow for bol if sym is eol
    284         if match_bol and self.sym == EOL:
    285             initial_state = self.build_opt(m, initial_state, BOL)
    286         initial_state.add_transition(self.sym, final_state)
    287 
    288 
    289 class Seq(RE):
    290     """Seq(re1, re2, re3...) is an RE which matches |re1| followed by
    291     |re2| followed by |re3|..."""
    292 
    293     def __init__(self, *re_list):
    294         nullable = 1
    295         for i in xrange(len(re_list)):
    296             re = re_list[i]
    297             self.check_re(i, re)
    298             nullable = nullable and re.nullable
    299         self.re_list = re_list
    300         self.nullable = nullable
    301         i = len(re_list)
    302         match_nl = 0
    303         while i:
    304             i = i - 1
    305             re = re_list[i]
    306             if re.match_nl:
    307                 match_nl = 1
    308                 break
    309             if not re.nullable:
    310                 break
    311         self.match_nl = match_nl
    312 
    313     def build_machine(self, m, initial_state, final_state, match_bol, nocase):
    314         re_list = self.re_list
    315         if len(re_list) == 0:
    316             initial_state.link_to(final_state)
    317         else:
    318             s1 = initial_state
    319             n = len(re_list)
    320             for i in xrange(n):
    321                 if i < n - 1:
    322                     s2 = m.new_state()
    323                 else:
    324                     s2 = final_state
    325                 re = re_list[i]
    326                 re.build_machine(m, s1, s2, match_bol, nocase)
    327                 s1 = s2
    328                 match_bol = re.match_nl or (match_bol and re.nullable)
    329 
    330     def calc_str(self):
    331         return "Seq(%s)" % ','.join(map(str, self.re_list))
    332 
    333 
    334 class Alt(RE):
    335     """Alt(re1, re2, re3...) is an RE which matches either |re1| or
    336     |re2| or |re3|..."""
    337 
    338     def __init__(self, *re_list):
    339         self.re_list = re_list
    340         nullable = 0
    341         match_nl = 0
    342         nullable_res = []
    343         non_nullable_res = []
    344         i = 1
    345         for re in re_list:
    346             self.check_re(i, re)
    347             if re.nullable:
    348                 nullable_res.append(re)
    349                 nullable = 1
    350             else:
    351                 non_nullable_res.append(re)
    352             if re.match_nl:
    353                 match_nl = 1
    354             i = i + 1
    355         self.nullable_res = nullable_res
    356         self.non_nullable_res = non_nullable_res
    357         self.nullable = nullable
    358         self.match_nl = match_nl
    359 
    360     def build_machine(self, m, initial_state, final_state, match_bol, nocase):
    361         for re in self.nullable_res:
    362             re.build_machine(m, initial_state, final_state, match_bol, nocase)
    363         if self.non_nullable_res:
    364             if match_bol:
    365                 initial_state = self.build_opt(m, initial_state, BOL)
    366             for re in self.non_nullable_res:
    367                 re.build_machine(m, initial_state, final_state, 0, nocase)
    368 
    369     def calc_str(self):
    370         return "Alt(%s)" % ','.join(map(str, self.re_list))
    371 
    372 
    373 class Rep1(RE):
    374     """Rep1(re) is an RE which matches one or more repetitions of |re|."""
    375 
    376     def __init__(self, re):
    377         self.check_re(1, re)
    378         self.re = re
    379         self.nullable = re.nullable
    380         self.match_nl = re.match_nl
    381 
    382     def build_machine(self, m, initial_state, final_state, match_bol, nocase):
    383         s1 = m.new_state()
    384         s2 = m.new_state()
    385         initial_state.link_to(s1)
    386         self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase)
    387         s2.link_to(s1)
    388         s2.link_to(final_state)
    389 
    390     def calc_str(self):
    391         return "Rep1(%s)" % self.re
    392 
    393 
    394 class SwitchCase(RE):
    395     """
    396     SwitchCase(re, nocase) is an RE which matches the same strings as RE,
    397     but treating upper and lower case letters according to |nocase|. If
    398     |nocase| is true, case is ignored, otherwise it is not.
    399     """
    400     re = None
    401     nocase = None
    402 
    403     def __init__(self, re, nocase):
    404         self.re = re
    405         self.nocase = nocase
    406         self.nullable = re.nullable
    407         self.match_nl = re.match_nl
    408 
    409     def build_machine(self, m, initial_state, final_state, match_bol, nocase):
    410         self.re.build_machine(m, initial_state, final_state, match_bol,
    411                                                     self.nocase)
    412 
    413     def calc_str(self):
    414         if self.nocase:
    415             name = "NoCase"
    416         else:
    417             name = "Case"
    418         return "%s(%s)" % (name, self.re)
    419 
    420 #
    421 #     Composite RE constructors
    422 #     -------------------------
    423 #
    424 #     These REs are defined in terms of the primitive REs.
    425 #
    426 
    427 Empty = Seq()
    428 Empty.__doc__ = \
    429     """
    430     Empty is an RE which matches the empty string.
    431     """
    432 Empty.str = "Empty"
    433 
    434 def Str1(s):
    435     """
    436     Str1(s) is an RE which matches the literal string |s|.
    437     """
    438     result = Seq(*tuple(map(Char, s)))
    439     result.str = "Str(%s)" % repr(s)
    440     return result
    441 
    442 def Str(*strs):
    443     """
    444     Str(s) is an RE which matches the literal string |s|.
    445     Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|...
    446     """
    447     if len(strs) == 1:
    448         return Str1(strs[0])
    449     else:
    450         result = Alt(*tuple(map(Str1, strs)))
    451         result.str = "Str(%s)" % ','.join(map(repr, strs))
    452         return result
    453 
    454 def Any(s):
    455     """
    456     Any(s) is an RE which matches any character in the string |s|.
    457     """
    458     #result = apply(Alt, tuple(map(Char, s)))
    459     result = CodeRanges(chars_to_ranges(s))
    460     result.str = "Any(%s)" % repr(s)
    461     return result
    462 
    463 def AnyBut(s):
    464     """
    465     AnyBut(s) is an RE which matches any character (including
    466     newline) which is not in the string |s|.
    467     """
    468     ranges = chars_to_ranges(s)
    469     ranges.insert(0, -maxint)
    470     ranges.append(maxint)
    471     result = CodeRanges(ranges)
    472     result.str = "AnyBut(%s)" % repr(s)
    473     return result
    474 
    475 AnyChar = AnyBut("")
    476 AnyChar.__doc__ = \
    477     """
    478     AnyChar is an RE which matches any single character (including a newline).
    479     """
    480 AnyChar.str = "AnyChar"
    481 
    482 def Range(s1, s2 = None):
    483     """
    484     Range(c1, c2) is an RE which matches any single character in the range
    485     |c1| to |c2| inclusive.
    486     Range(s) where |s| is a string of even length is an RE which matches
    487     any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,...
    488     """
    489     if s2:
    490         result = CodeRange(ord(s1), ord(s2) + 1)
    491         result.str = "Range(%s,%s)" % (s1, s2)
    492     else:
    493         ranges = []
    494         for i in range(0, len(s1), 2):
    495             ranges.append(CodeRange(ord(s1[i]), ord(s1[i+1]) + 1))
    496         result = Alt(*ranges)
    497         result.str = "Range(%s)" % repr(s1)
    498     return result
    499 
    500 def Opt(re):
    501     """
    502     Opt(re) is an RE which matches either |re| or the empty string.
    503     """
    504     result = Alt(re, Empty)
    505     result.str = "Opt(%s)" % re
    506     return result
    507 
    508 def Rep(re):
    509     """
    510     Rep(re) is an RE which matches zero or more repetitions of |re|.
    511     """
    512     result = Opt(Rep1(re))
    513     result.str = "Rep(%s)" % re
    514     return result
    515 
    516 def NoCase(re):
    517     """
    518     NoCase(re) is an RE which matches the same strings as RE, but treating
    519     upper and lower case letters as equivalent.
    520     """
    521     return SwitchCase(re, nocase = 1)
    522 
    523 def Case(re):
    524     """
    525     Case(re) is an RE which matches the same strings as RE, but treating
    526     upper and lower case letters as distinct, i.e. it cancels the effect
    527     of any enclosing NoCase().
    528     """
    529     return SwitchCase(re, nocase = 0)
    530 
    531 #
    532 #     RE Constants
    533 #
    534 
    535 Bol = Char(BOL)
    536 Bol.__doc__ = \
    537     """
    538     Bol is an RE which matches the beginning of a line.
    539     """
    540 Bol.str = "Bol"
    541 
    542 Eol = Char(EOL)
    543 Eol.__doc__ = \
    544     """
    545     Eol is an RE which matches the end of a line.
    546     """
    547 Eol.str = "Eol"
    548 
    549 Eof = Char(EOF)
    550 Eof.__doc__ = \
    551     """
    552     Eof is an RE which matches the end of the file.
    553     """
    554 Eof.str = "Eof"
    555 
    556