Home | History | Annotate | Download | only in tools
      1 #!/usr/bin/env python
      2 
from contextlib import closing

try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3
      4 
      5 TLD_PREFIX = r"""
      6     /**
      7      *  Regular expression to match all IANA top-level domains.
      8      *  List accurate as of 2011/07/18.  List taken from:
      9      *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     10      *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     11      */
     12     public static final String TOP_LEVEL_DOMAIN_STR =
     13 """
     14 TLD_SUFFIX = '";'
     15 
     16 URL_PREFIX = r"""
     17     /**
     18      *  Regular expression to match all IANA top-level domains for WEB_URL.
     19      *  List accurate as of 2011/07/18.  List taken from:
     20      *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     21      *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     22      */
     23     public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
     24         "(?:"
     25 """
     26 
     27 URL_SUFFIX = ';'
     28 TAB = '        '
     29 
     30 class BucketOutput:
     31     def __init__(self):
     32         self.buffer = TAB
     33         self.lineLength = len(TAB)
     34 
     35     def __iadd__(self, other):
     36         self.buffer += other
     37         self.lineLength += len(other)
     38         return self
     39 
     40     def addPipe(self):
     41         if self.lineLength > 90:
     42             self.buffer += '"\n'
     43             self.buffer += TAB
     44             self.buffer += '+ "'
     45             self.lineLength = len(TAB)
     46 
     47         self += '|'
     48 
     49     def value(self):
     50         return self.buffer
     51 
     52 class Bucket:
     53     def __init__(self, baseLetter):
     54         self.base=baseLetter
     55         self.words=[]
     56         self.letters=[]
     57 
     58     def dump(self, isWebUrl=False, isFirst=False, isLast=False):
     59         if (len(self.words) == 0) and (len(self.letters) == 0):
     60             return ''
     61 
     62         self.words.sort()
     63         self.letters.sort()
     64 
     65         output = BucketOutput()
     66 
     67         if isFirst:
     68             if isWebUrl:
     69                 output += '+ "'
     70             else:
     71                 output += '"('
     72         else:
     73             output += '+ "|'
     74 
     75         if len(self.words) != 0:
     76             output += '('
     77 
     78             if isWebUrl:
     79                 output += '?:'
     80 
     81         firstWord = 1
     82         for word in self.words:
     83             if firstWord == 0:
     84                 output.addPipe()
     85             firstWord = 0
     86             for letter in word:
     87                 if letter == '-':
     88                     output += '\\\\'  # escape the '-' character.
     89                 output += letter
     90 
     91         if len(self.words) > 0 and len(self.letters) > 0:
     92             output.addPipe()
     93 
     94         if len(self.letters) == 1:
     95             output += '%c%c' % (self.base, self.letters[0])
     96         elif len(self.letters) > 0:
     97             output += '%c[' % self.base
     98 
     99             for letter in self.letters:
    100                 output += letter
    101 
    102             output += ']'
    103 
    104         if len(self.words) != 0:
    105             output += ')'
    106 
    107         if not isLast:
    108             output += '"'
    109             output += '\n'
    110 
    111         return output.value();
    112 
    113     def add(self, line):
    114         length = len(line)
    115 
    116         if line.startswith('#') or (length == 0):
    117             return;
    118 
    119         if length == 2:
    120             self.letters.append(line[1:2])
    121         else:
    122             self.words.append(line)
    123 
    124 def getBucket(buckets, line):
    125     letter = line[0]
    126     bucket = buckets.get(letter)
    127 
    128     if bucket is None:
    129         bucket = Bucket(letter)
    130         buckets[letter] = bucket
    131 
    132     return bucket
    133 
    134 def makePattern(prefix, suffix, buckets, isWebUrl=False):
    135     output = prefix
    136 
    137     output += getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl)
    138 
    139     for letter in range(ord('b'), ord('z')):
    140         output += getBucket(buckets, chr(letter)).dump(isWebUrl=isWebUrl)
    141 
    142     output += getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl)
    143 
    144     if isWebUrl:
    145         output += '))"'
    146     else:
    147         output += ')'
    148 
    149     output += suffix
    150 
    151     print output
    152 
    153 if __name__ == "__main__":
    154     f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    155     domains = f.readlines()
    156     f.close()
    157 
    158     buckets = {}
    159 
    160     for domain in domains:
    161         domain = domain.lower()
    162 
    163         if len(domain) > 0:
    164             getBucket(buckets, domain[0]).add(domain.strip())
    165 
    166         if domain.startswith('xn--'):
    167 	   puny = domain.strip()[4:]
    168 	   result = puny.decode('punycode')
    169 	   result = repr(result)
    170            getBucket(buckets, 'xn--').add(result[2:-1])
    171 
    172     makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    173     makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
    174