Home | History | Annotate | Download | only in tld_cleanup
      1 #!/usr/bin/env python
      2 # Copyright 2014 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """
      7 A Deterministic acyclic finite state automaton (DAFSA) is a compact
      8 representation of an unordered word list (dictionary).
      9 
     10 http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
     11 
     12 This python program converts a list of strings to a byte array in C++.
     13 This python program fetches strings and return values from a gperf file
     14 and generates a C++ file with a byte array representing graph that can be
     15 used as a memory efficient replacement for the perfect hash table.
     16 
     17 The input strings are assumed to consist of printable 7-bit ASCII characters
     18 and the return values are assumed to be one digit integers.
     19 
     20 In this program a DAFSA is a diamond shaped graph starting at a common
     21 source node and ending at a common sink node. All internal nodes contain
     22 a label and each word is represented by the labels in one path from
     23 the source node to the sink node.
     24 
     25 The following python represention is used for nodes:
     26 
     27   Source node: [ children ]
     28   Internal node: (label, [ children ])
     29   Sink node: None
     30 
     31 The graph is first compressed by prefixes like a trie. In the next step
     32 suffixes are compressed so that the graph gets diamond shaped. Finally
     33 one to one linked nodes are replaced by nodes with the labels joined.
     34 
     35 The order of the operations is crucial since lookups will be performed
     36 starting from the source with no backtracking. Thus a node must have at
     37 most one child with a label starting by the same character. The output
     38 is also arranged so that all jumps are to increasing addresses, thus forward
     39 in memory.
     40 
     41 The generated output has suffix free decoding so that the sign of leading
     42 bits in a link (a reference to a child node) indicate if it has a size of one,
     43 two or three bytes and if it is the last outgoing link from the actual node.
     44 A node label is terminated by a byte with the leading bit set.
     45 
     46 The generated byte array can described by the following BNF:
     47 
     48 <byte> ::= < 8-bit value in range [0x00-0xFF] >
     49 
     50 <char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
     51 <end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
     52 <return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
     53 
     54 <offset1> ::= < byte in range [0x00-0x3F] >
     55 <offset2> ::= < byte in range [0x40-0x5F] >
     56 <offset3> ::= < byte in range [0x60-0x7F] >
     57 
     58 <end_offset1> ::= < byte in range [0x80-0xBF] >
     59 <end_offset2> ::= < byte in range [0xC0-0xDF] >
     60 <end_offset3> ::= < byte in range [0xE0-0xFF] >
     61 
     62 <prefix> ::= <char>
     63 
     64 <label> ::= <end_char>
     65           | <char> <label>
     66 
     67 <end_label> ::= <return_value>
     68           | <char> <end_label>
     69 
     70 <offset> ::= <offset1>
     71            | <offset2> <byte>
     72            | <offset3> <byte> <byte>
     73 
     74 <end_offset> ::= <end_offset1>
     75                | <end_offset2> <byte>
     76                | <end_offset3> <byte> <byte>
     77 
     78 <offsets> ::= <end_offset>
     79             | <offset> <offsets>
     80 
     81 <source> ::= <offsets>
     82 
     83 <node> ::= <label> <offsets>
     84          | <prefix> <node>
     85          | <end_label>
     86 
     87 <dafsa> ::= <source>
     88           | <dafsa> <node>
     89 
     90 Decoding:
     91 
     92 <char> -> printable 7-bit ASCII character
     93 <end_char> & 0x7F -> printable 7-bit ASCII character
     94 <return value> & 0x0F -> integer
     95 <offset1 & 0x3F> -> integer
     96 ((<offset2> & 0x1F>) << 8) + <byte> -> integer
     97 ((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer
     98 
     99 end_offset1, end_offset2 and and_offset3 are decoded same as offset1,
    100 offset2 and offset3 respectively.
    101 
    102 The first offset in a list of offsets is the distance in bytes between the
    103 offset itself and the first child node. Subsequent offsets are the distance
    104 between previous child node and next child node. Thus each offset links a node
    105 to a child node. The distance is always counted between start addresses, i.e.
    106 first byte in decoded offset or first byte in child node.
    107 
    108 Example 1:
    109 
    110 %%
    111 aa, 1
    112 a, 2
    113 %%
    114 
    115 The input is first parsed to a list of words:
    116 ["aa1", "a2"]
    117 
    118 A fully expanded graph is created from the words:
    119 source = [node1, node4]
    120 node1 = ("a", [node2])
    121 node2 = ("a", [node3])
    122 node3 = ("\x01", [sink])
    123 node4 = ("a", [node5])
    124 node5 = ("\x02", [sink])
    125 sink = None
    126 
    127 Compression results in the following graph:
    128 source = [node1]
    129 node1 = ("a", [node2, node3])
    130 node2 = ("\x02", [sink])
    131 node3 = ("a\x01", [sink])
    132 sink = None
    133 
    134 A C++ representation of the compressed graph is generated:
    135 
    136 const unsigned char dafsa[7] = {
    137   0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
    138 };
    139 
    140 The bytes in the generated array has the following meaning:
    141 
    142  0: 0x81 <end_offset1>  child at position 0 + (0x81 & 0x3F) -> jump to 1
    143 
    144  1: 0xE1 <end_char>     label character (0xE1 & 0x7F) -> match "a"
    145  2: 0x02 <offset1>      child at position 2 + (0x02 & 0x3F) -> jump to 4
    146 
    147  3: 0x81 <end_offset1>  child at position 4 + (0x81 & 0x3F) -> jump to 5
    148  4: 0x82 <return_value> 0x82 & 0x0F -> return 2
    149 
    150  5: 0x61 <char>         label character 0x61 -> match "a"
    151  6: 0x81 <return_value> 0x81 & 0x0F -> return 1
    152 
    153 Example 2:
    154 
    155 %%
    156 aa, 1
    157 bbb, 2
    158 baa, 1
    159 %%
    160 
    161 The input is first parsed to a list of words:
    162 ["aa1", "bbb2", "baa1"]
    163 
    164 Compression results in the following graph:
    165 source = [node1, node2]
    166 node1 = ("b", [node2, node3])
    167 node2 = ("aa\x01", [sink])
    168 node3 = ("bb\x02", [sink])
    169 sink = None
    170 
    171 A C++ representation of the compressed graph is generated:
    172 
    173 const unsigned char dafsa[11] = {
    174   0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
    175 };
    176 
    177 The bytes in the generated array has the following meaning:
    178 
    179  0: 0x02 <offset1>      child at position 0 + (0x02 & 0x3F) -> jump to 2
    180  1: 0x83 <end_offset1>  child at position 2 + (0x83 & 0x3F) -> jump to 5
    181 
    182  2: 0xE2 <end_char>     label character (0xE2 & 0x7F) -> match "b"
    183  3: 0x02 <offset1>      child at position 3 + (0x02 & 0x3F) -> jump to 5
    184  4: 0x83 <end_offset1>  child at position 5 + (0x83 & 0x3F) -> jump to 8
    185 
    186  5: 0x61 <char>         label character 0x61 -> match "a"
    187  6: 0x61 <char>         label character 0x61 -> match "a"
    188  7: 0x81 <return_value> 0x81 & 0x0F -> return 1
    189 
    190  8: 0x62 <char>         label character 0x62 -> match "b"
    191  9: 0x62 <char>         label character 0x62 -> match "b"
    192 10: 0x82 <return_value> 0x82 & 0x0F -> return 2
    193 """
    194 
    195 import sys
    196 
    197 class InputError(Exception):
    198   """Exception raised for errors in the input file."""
    199 
    200 
    201 def to_dafsa(words):
    202   """Generates a DAFSA from a word list and returns the source node.
    203 
    204   Each word is split into characters so that each character is represented by
    205   a unique node. It is assumed the word list is not empty.
    206   """
    207   if not words:
    208     raise InputError('The domain list must not be empty')
    209   def ToNodes(word):
    210     """Split words into characters"""
    211     if not 0x1F < ord(word[0]) < 0x80:
    212       raise InputError('Domain names must be printable 7-bit ASCII')
    213     if len(word) == 1:
    214       return chr(ord(word[0]) & 0x0F), [None]
    215     return word[0], [ToNodes(word[1:])]
    216   return [ToNodes(word) for word in words]
    217 
    218 
    219 def to_words(node):
    220   """Generates a word list from all paths starting from an internal node."""
    221   if not node:
    222     return ['']
    223   return [(node[0] + word) for child in node[1] for word in to_words(child)]
    224 
    225 
    226 def reverse(dafsa):
    227   """Generates a new DAFSA that is reversed, so that the old sink node becomes
    228   the new source node.
    229   """
    230   sink = []
    231   nodemap = {}
    232 
    233   def dfs(node, parent):
    234     """Creates reverse nodes.
    235 
    236     A new reverse node will be created for each old node. The new node will
    237     get a reversed label and the parents of the old node as children.
    238     """
    239     if not node:
    240       sink.append(parent)
    241     elif id(node) not in nodemap:
    242       nodemap[id(node)] = (node[0][::-1], [parent])
    243       for child in node[1]:
    244         dfs(child, nodemap[id(node)])
    245     else:
    246       nodemap[id(node)][1].append(parent)
    247 
    248   for node in dafsa:
    249     dfs(node, None)
    250   return sink
    251 
    252 
    253 def join_labels(dafsa):
    254   """Generates a new DAFSA where internal nodes are merged if there is a one to
    255   one connection.
    256   """
    257   parentcount = { id(None): 2 }
    258   nodemap = { id(None): None }
    259 
    260   def count_parents(node):
    261     """Count incoming references"""
    262     if id(node) in parentcount:
    263       parentcount[id(node)] += 1
    264     else:
    265       parentcount[id(node)] = 1
    266       for child in node[1]:
    267         count_parents(child)
    268 
    269   def join(node):
    270     """Create new nodes"""
    271     if id(node) not in nodemap:
    272       children = [join(child) for child in node[1]]
    273       if len(children) == 1 and parentcount[id(node[1][0])] == 1:
    274         child = children[0]
    275         nodemap[id(node)] = (node[0] + child[0], child[1])
    276       else:
    277         nodemap[id(node)] = (node[0], children)
    278     return nodemap[id(node)]
    279 
    280   for node in dafsa:
    281     count_parents(node)
    282   return [join(node) for node in dafsa]
    283 
    284 
    285 def join_suffixes(dafsa):
    286   """Generates a new DAFSA where nodes that represent the same word lists
    287   towards the sink are merged.
    288   """
    289   nodemap = { frozenset(('',)): None }
    290 
    291   def join(node):
    292     """Returns a macthing node. A new node is created if no matching node
    293     exists. The graph is accessed in dfs order.
    294     """
    295     suffixes = frozenset(to_words(node))
    296     if suffixes not in nodemap:
    297       nodemap[suffixes] = (node[0], [join(child) for child in node[1]])
    298     return nodemap[suffixes]
    299 
    300   return [join(node) for node in dafsa]
    301 
    302 
    303 def top_sort(dafsa):
    304   """Generates list of nodes in topological sort order."""
    305   incoming = {}
    306 
    307   def count_incoming(node):
    308     """Counts incoming references."""
    309     if node:
    310       if id(node) not in incoming:
    311         incoming[id(node)] = 1
    312         for child in node[1]:
    313           count_incoming(child)
    314       else:
    315         incoming[id(node)] += 1
    316 
    317   for node in dafsa:
    318     count_incoming(node)
    319 
    320   for node in dafsa:
    321     incoming[id(node)] -= 1
    322 
    323   waiting = [node for node in dafsa if incoming[id(node)] == 0]
    324   nodes = []
    325 
    326   while waiting:
    327     node = waiting.pop()
    328     assert incoming[id(node)] == 0
    329     nodes.append(node)
    330     for child in node[1]:
    331       if child:
    332         incoming[id(child)] -= 1
    333         if incoming[id(child)] == 0:
    334           waiting.append(child)
    335   return nodes
    336 
    337 
    338 def encode_links(children, offsets, current):
    339   """Encodes a list of children as one, two or three byte offsets."""
    340   if not children[0]:
    341     # This is an <end_label> node and no links follow such nodes
    342     assert len(children) == 1
    343     return []
    344   guess = 3 * len(children)
    345   assert children
    346   children = sorted(children, key = lambda x: -offsets[id(x)])
    347   while True:
    348     offset = current + guess
    349     buf = []
    350     for child in children:
    351       last = len(buf)
    352       distance = offset - offsets[id(child)]
    353       assert distance > 0 and distance < (1 << 21)
    354 
    355       if distance < (1 << 6):
    356         # A 6-bit offset: "s0xxxxxx"
    357         buf.append(distance)
    358       elif distance < (1 << 13):
    359         # A 13-bit offset: "s10xxxxxxxxxxxxx"
    360         buf.append(0x40 | (distance >> 8))
    361         buf.append(distance & 0xFF)
    362       else:
    363         # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
    364         buf.append(0x60 | (distance >> 16))
    365         buf.append((distance >> 8) & 0xFF)
    366         buf.append(distance & 0xFF)
    367       # Distance in first link is relative to following record.
    368       # Distance in other links are relative to previous link.
    369       offset -= distance
    370     if len(buf) == guess:
    371       break
    372     guess = len(buf)
    373   # Set most significant bit to mark end of links in this node.
    374   buf[last] |= (1 << 7)
    375   buf.reverse()
    376   return buf
    377 
    378 
    379 def encode_prefix(label):
    380   """Encodes a node label as a list of bytes without a trailing high byte.
    381 
    382   This method encodes a node if there is exactly one child  and the
    383   child follows immidiately after so that no jump is needed. This label
    384   will then be a prefix to the label in the child node.
    385   """
    386   assert label
    387   return [ord(c) for c in reversed(label)]
    388 
    389 
    390 def encode_label(label):
    391   """Encodes a node label as a list of bytes with a trailing high byte >0x80.
    392   """
    393   buf = encode_prefix(label)
    394   # Set most significant bit to mark end of label in this node.
    395   buf[0] |= (1 << 7)
    396   return buf
    397 
    398 
    399 def encode(dafsa):
    400   """Encodes a DAFSA to a list of bytes"""
    401   output = []
    402   offsets = {}
    403 
    404   for node in reversed(top_sort(dafsa)):
    405     if (len(node[1]) == 1 and node[1][0] and
    406         (offsets[id(node[1][0])] == len(output))):
    407       output.extend(encode_prefix(node[0]))
    408     else:
    409       output.extend(encode_links(node[1], offsets, len(output)))
    410       output.extend(encode_label(node[0]))
    411     offsets[id(node)] = len(output)
    412 
    413   output.extend(encode_links(dafsa, offsets, len(output)))
    414   output.reverse()
    415   return output
    416 
    417 
    418 def to_cxx(data):
    419   """Generates C++ code from a list of encoded bytes."""
    420   text = '/* This file is generated. DO NOT EDIT!\n\n'
    421   text += 'The byte array encodes effective tld names. See make_dafsa.py for'
    422   text += ' documentation.'
    423   text += '*/\n\n'
    424   text += 'const unsigned char kDafsa[%s] = {\n' % len(data)
    425   for i in range(0, len(data), 12):
    426     text += '  '
    427     text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
    428     text += ',\n'
    429   text += '};\n'
    430   return text
    431 
    432 
    433 def words_to_cxx(words):
    434   """Generates C++ code from a word list"""
    435   dafsa = to_dafsa(words)
    436   for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
    437     dafsa = fun(dafsa)
    438   return to_cxx(encode(dafsa))
    439 
    440 
    441 def parse_gperf(infile):
    442   """Parses gperf file and extract strings and return code"""
    443   lines = [line.strip() for line in infile]
    444   # Extract strings after the first '%%' and before the second '%%'.
    445   begin = lines.index('%%') + 1
    446   end = lines.index('%%', begin)
    447   lines = lines[begin:end]
    448   for line in lines:
    449     if line[-3:-1] != ', ':
    450       raise InputError('Expected "domainname, <digit>", found "%s"' % line)
    451     # Technically the DAFSA format could support return values in range [0-31],
    452     # but the values below are the only with a defined meaning.
    453     if line[-1] not in '0124':
    454       raise InputError('Expected value to be one of {0,1,2,4}, found "%s"' %
    455                        line[-1])
    456   return [line[:-3] + line[-1] for line in lines]
    457 
    458 
    459 def main():
    460   if len(sys.argv) != 3:
    461     print('usage: %s infile outfile' % sys.argv[0])
    462     return 1
    463   with open(sys.argv[1], 'r') as infile, open(sys.argv[2], 'w') as outfile:
    464     outfile.write(words_to_cxx(parse_gperf(infile)))
    465   return 0
    466 
    467 
    468 if __name__ == '__main__':
    469   sys.exit(main())
    470