Home | History | Annotate | Download | only in tools
      1 #!/usr/bin/env python
      2 
      3 from urllib2 import urlopen
      4 
# Java source text printed ahead of the generated alternation for the plain
# TLD pattern: a Javadoc comment followed by the start of the
# TOP_LEVEL_DOMAIN_STR declaration.
# NOTE: the "List accurate as of" date below is literal text; it is not
# refreshed when the script is re-run and must be edited by hand.
TLD_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains.
     *  List accurate as of 2011/07/18.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
# Closes the last Java string literal (left open by the final bucket) and
# terminates the Java statement.
TLD_SUFFIX = '";'

# Java source text printed ahead of the generated alternation for the WEB_URL
# variant. Besides the Javadoc and declaration it also emits the first string
# fragment, which opens the outer non-capturing group "(?:".
# NOTE: the "List accurate as of" date below is literal text as well.
URL_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains for WEB_URL.
     *  List accurate as of 2011/07/18.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""

# Only the Java statement terminator: for the WEB_URL variant makePattern
# appends the closing '))"' itself, so no quote is needed here.
URL_SUFFIX = ';'
     28 
     29 class Bucket:
     30     def __init__(self, baseLetter):
     31         self.base=baseLetter
     32         self.words=[]
     33         self.letters=[]
     34 
     35     def dump(self, isWebUrl=False, isFirst=False, isLast=False):
     36         if (len(self.words) == 0) and (len(self.letters) == 0):
     37             return ''
     38 
     39         self.words.sort()
     40         self.letters.sort()
     41 
     42         output = '        ';
     43 
     44         if isFirst:
     45             if isWebUrl:
     46                 output += '+ "'
     47             else:
     48                 output += '"('
     49         else:
     50             output += '+ "|'
     51 
     52         if len(self.words) != 0:
     53             output += '('
     54 
     55             if isWebUrl:
     56                 output += '?:'
     57 
     58         firstWord = 1
     59         for word in self.words:
     60             if firstWord == 0:
     61                 output += '|'
     62             firstWord = 0
     63             for letter in word:
     64                 if letter == '-':
     65                     output += '\\\\'  # escape the '-' character.
     66                 output += letter
     67 
     68         if len(self.words) > 0 and len(self.letters) > 0:
     69             output += '|'
     70 
     71         if len(self.letters) == 1:
     72             output += '%c%c' % (self.base, self.letters[0])
     73         elif len(self.letters) > 0:
     74             output += '%c[' % self.base
     75 
     76             for letter in self.letters:
     77                 output += letter
     78 
     79             output += ']'
     80 
     81         if len(self.words) != 0:
     82             output += ')'
     83 
     84         if not isLast:
     85             output += '"'
     86             output += '\n'
     87 
     88         return output;
     89 
     90     def add(self, line):
     91         length = len(line)
     92 
     93         if line.startswith('#') or (length == 0):
     94             return;
     95 
     96         if length == 2:
     97             self.letters.append(line[1:2])
     98         else:
     99             self.words.append(line)
    100 
    101 def getBucket(buckets, line):
    102     letter = line[0]
    103     bucket = buckets.get(letter)
    104 
    105     if bucket is None:
    106         bucket = Bucket(letter)
    107         buckets[letter] = bucket
    108 
    109     return bucket
    110 
    111 def makePattern(prefix, suffix, buckets, isWebUrl=False):
    112     output = prefix
    113 
    114     output += getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl)
    115 
    116     for letter in range(ord('b'), ord('z')):
    117         output += getBucket(buckets, chr(letter)).dump(isWebUrl=isWebUrl)
    118 
    119     output += getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl)
    120 
    121     if isWebUrl:
    122         output += '))"'
    123     else:
    124         output += ')'
    125 
    126     output += suffix
    127 
    128     print output
    129 
    130 if __name__ == "__main__":
    131     f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    132     domains = f.readlines()
    133     f.close()
    134 
    135     buckets = {}
    136 
    137     for domain in domains:
    138         domain = domain.lower()
    139 
    140         if len(domain) > 0:
    141             getBucket(buckets, domain[0]).add(domain.strip())
    142 
    143         if domain.startswith('xn--'):
    144 	   puny = domain.strip()[4:]
    145 	   result = puny.decode('punycode')
    146 	   result = repr(result)
    147            getBucket(buckets, 'xn--').add(result[2:-1])
    148 
    149     makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    150     makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
    151