Home | History | Annotate | Download | only in tools
      1 #!/usr/bin/env python
      2 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """Process a History database and dump a .dot file suitable for GraphViz.
      7 
      8 This is useful for debugging history redirect flows.
      9 
     10 An example run of this program:
     11   python /src/history-viz.py History > foo.dot
     12   /c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png
     13 """
     14 
     15 import struct
     16 import subprocess
     17 import sys
     18 import urlparse
     19 
     20 
     21 # Some transition types, copied from page_transition_types.h.
     22 TRANS_TYPES = {
     23   0: 'link',
     24   1: 'typed',
     25   2: 'most-visited',
     26   3: 'auto subframe',
     27   7: 'form',
     28 }
     29 
     30 
     31 class URL(object):
     32   """Represents a broken-down URL from our most visited database."""
     33 
     34   def __init__(self, id, url):
     35     """Initialize a new URL object.  |id| is the database id of the URL."""
     36     self.id = id
     37     self.url = url
     38     scheme, loc, path, query, fragment = urlparse.urlsplit(url)
     39     if scheme == 'http':
     40       scheme = ''  # Shorten for display purposes.
     41     if len(scheme) > 0:
     42       scheme += '://'
     43     self.host = scheme + loc
     44     self.path = path
     45 
     46     extra = ''
     47     if len(query) > 0:
     48       extra += '?' + query
     49     if len(fragment) > 0 or url.find('#') > 0:
     50       extra += '#' + fragment
     51     self.extra = extra
     52 
     53   def PrettyPrint(self, include_host=True, include_path=True):
     54     """Pretty-print this URL in a form more suitable for the graph.
     55 
     56     This will elide very long paths and potentially puts newlines between parts
     57     of long components.  include_host and include_path determine whether to
     58     include the host and path in the output.
     59 
     60     Returns: the pretty-printed string."""
     61     MAX_LEN = 30  # Maximum length of a line in the output.
     62     parts = []
     63     if include_host:
     64       parts.append(self.host)
     65     if include_path:
     66       parts.append(self.path)
     67     parts.append(self.extra)
     68     lines = []
     69     line = ''
     70     for part in parts:
     71       if len(part) > MAX_LEN:
     72         part = part[0:MAX_LEN-3] + '...'
     73       if len(line)+len(part) > MAX_LEN:
     74         lines.append(line)
     75         line = ''
     76       line += part
     77     if len(line) > 0:
     78       lines.append(line)
     79     return '\n'.join(lines)
     80 
     81 
     82 class Edge(object):
     83   """Represents an edge in the history graph, connecting two pages.
     84 
     85   If a link is traversed twice, it is one Edge with two entries in
     86   the .transitions array."""
     87   def __init__(self, src, dst):
     88     self.src = src
     89     self.dst = dst
     90     self.transitions = []
     91 
     92   def Transitions(self):
     93     """Return a dictionary mapping transition type -> occurences."""
     94     all = {}
     95     for trans in self.transitions:
     96       all[trans] = all.get(trans, 0) + 1
     97       # We currently don't use the chain type.
     98       # TODO(evanm): make this a command-line option.
     99       # if trans & 0x30000000 != 0:
    100       #   chain = ''
    101       #   if trans & 0x10000000:
    102       #     chain = 'start'
    103       #   if trans & 0x20000000:
    104       #     if len(chain) == 0:
    105       #       chain = 'end'
    106       #     else:
    107       #       chain = ''
    108       #   if len(chain) > 0:
    109       #     edge['chain'] = chain
    110     return all
    111 
    112 
    113 def ClusterBy(objs, pred):
    114   """Group a list of objects by a predicate.
    115 
    116   Given a list of objects and a predicate over the objects, return a
    117   dictionary mapping pred(obj) -> all objs with the same pred(obj)."""
    118   clusters = {}
    119   for obj in objs:
    120     cluster = pred(obj)
    121     clusters[cluster] = clusters.get(cluster, [])
    122     clusters[cluster].append(obj)
    123   return clusters
    124 
    125 
    126 def EscapeDot(string):
    127   """Escape a string suitable for embedding in a graphviz graph."""
    128   # TODO(evanm): this is likely not sufficient.
    129   return string.replace('\n', '\\n')
    130 
    131 
    132 class SQLite(object):
    133   """Trivial interface to executing SQLite queries.
    134   Spawns a new process with each call."""
    135   def __init__(self, file=None):
    136     self.file = file
    137 
    138   def Run(self, sql):
    139     """Execute |sql|, yielding each row of results as an array."""
    140     subproc = subprocess.Popen(['sqlite', self.file],
    141                                stdin=subprocess.PIPE,
    142                                stdout=subprocess.PIPE)
    143     subproc.stdin.write('.mode tabs\n')
    144     subproc.stdin.write(sql + ';')
    145     subproc.stdin.close()
    146     for line in subproc.stdout:
    147       row = line.strip().split('\t')
    148       yield row
    149 
    150 
    151 def LoadHistory(filename):
    152   db = SQLite(filename)
    153 
    154   urls = {}  # Map of urlid => url.
    155   urls['0'] = URL('0', 'start')  # Node name '0' is our special 'start' node.
    156   for id, url in db.Run('SELECT id, url FROM urls'):
    157     urls[id] = URL(id, url)
    158 
    159   visiturlids = {}  # Map of visitid => urlid.
    160   visiturlids['0'] = '0'  # '0' is our special 'start' node.
    161   edges = {}  # Map of urlid->urlid->Edge.
    162   for src, dst, url, trans in db.Run('SELECT from_visit, id, url, transition '
    163                                      'FROM visits ORDER BY id'):
    164     visiturlids[dst] = url
    165     src = visiturlids[src]
    166     dst = visiturlids[dst]
    167     edges[src] = edges.get(src, {})
    168     edge = edges[src][dst] = edges[src].get(dst, Edge(src, dst))
    169     # SQLite outputs transition values as signed integers, but they're really
    170     # a bitfield.  Below does "unsigned trans = static_cast<unsigned>(trans)".
    171     trans = struct.unpack('I', struct.pack('i', int(trans)))[0]
    172     edge.transitions.append(trans)
    173 
    174   return urls, edges
    175 
    176 
    177 def main():
    178   urls, edges = LoadHistory(sys.argv[1])
    179   print 'digraph G {'
    180   print '  graph [rankdir=LR]'  # Display left to right.
    181   print '  node [shape=box]'    # Display nodes as boxes.
    182   print '  subgraph { rank=source; 0 [label="start"] }'
    183 
    184   # Output all the nodes within graph clusters.
    185   hosts = ClusterBy(urls.values(), lambda url: url.host)
    186   for i, (host, urls) in enumerate(hosts.items()):
    187     # Cluster all URLs under this host if it has more than one entry.
    188     host_clustered = len(urls) > 1
    189     if host_clustered:
    190       print 'subgraph clusterhost%d {' % i
    191       print '  label="%s"' % host
    192     paths = ClusterBy(urls, lambda url: url.path)
    193     for j, (path, urls) in enumerate(paths.items()):
    194       # Cluster all URLs under this host if it has more than one entry.
    195       path_clustered = host_clustered and len(urls) > 1
    196       if path_clustered:
    197         print '  subgraph cluster%d%d {' % (i, j)
    198         print '    label="%s"' % path
    199       for url in urls:
    200         if url.id == '0': continue  # We already output the special start node.
    201         pretty = url.PrettyPrint(include_host=not host_clustered,
    202                                 include_path=not path_clustered)
    203         print '    %s [label="%s"]' % (url.id, EscapeDot(pretty))
    204       if path_clustered:
    205         print '  }'
    206     if host_clustered:
    207       print '}'
    208 
    209   # Output all the edges between nodes.
    210   for src, dsts in edges.items():
    211     for dst, edge in dsts.items():
    212       # Gather up all the transitions into the label.
    213       label = []      # Label for the edge.
    214       transitions = edge.Transitions()
    215       for trans, count in transitions.items():
    216         text = ''
    217         if count > 1:
    218           text = '%dx ' % count
    219         base_type = trans & 0xFF
    220         redir = (trans & 0xC0000000) != 0
    221         start = (trans & 0x10000000) != 0
    222         end = (trans & 0x20000000) != 0
    223         if start or end:
    224           if start:
    225             text += '<'
    226           if end:
    227             text += '>'
    228           text += ' '
    229         if redir:
    230           text += 'R '
    231         text += TRANS_TYPES.get(base_type, 'trans%d' % base_type)
    232         label.append(text)
    233       if len(label) == 0:
    234         continue
    235 
    236       edgeattrs = []  # Graphviz attributes for the edge.
    237       # If the edge is from the start and the transitions are fishy, make it
    238       # display as a dotted line.
    239       if src == '0' and len(transitions.keys()) == 1 and transitions.has_key(0):
    240         edgeattrs.append('style=dashed')
    241       if len(label) > 0:
    242         edgeattrs.append('label="%s"' % EscapeDot('\n'.join(label)))
    243 
    244       out = '%s -> %s' % (src, dst)
    245       if len(edgeattrs) > 0:
    246         out += ' [%s]' % ','.join(edgeattrs)
    247       print out
    248   print '}'
    249   return 0
    250 
    251 
    252 if __name__ == '__main__':
    253   sys.exit(main())
    254