Home | History | Annotate | Download | only in histograms
      1 #!/usr/bin/env python
      2 # Copyright 2013 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """Pretty-prints the histograms.xml file, alphabetizing tags, wrapping text
      7 at 80 chars, enforcing standard attribute ordering, and standardizing
      8 indentation.
      9 
     10 This is quite a bit more complicated than just calling tree.toprettyxml();
     11 we need additional customization, like special attribute ordering in tags
     12 and wrapping text nodes, so we implement our own full custom XML pretty-printer.
     13 """
     14 
     15 from __future__ import with_statement
     16 
     17 import diffutil
     18 import json
     19 import logging
     20 import shutil
     21 import sys
     22 import textwrap
     23 import xml.dom.minidom
     24 
     25 
     26 WRAP_COLUMN = 80
     27 
     28 # Desired order for tag attributes; attributes listed here will appear first,
     29 # and in the same order as in these lists.
     30 # { tag_name: [attribute_name, ...] }
     31 ATTRIBUTE_ORDER = {
     32   'enum': ['name', 'type'],
     33   'histogram': ['name', 'enum', 'units'],
     34   'int': ['value', 'label'],
     35   'fieldtrial': ['name', 'separator', 'ordering'],
     36   'group': ['name', 'label'],
     37   'affected-histogram': ['name'],
     38   'with-group': ['name'],
     39 }
     40 
     41 # Tag names for top-level nodes whose children we don't want to indent.
     42 TAGS_THAT_DONT_INDENT = [
     43   'histogram-configuration',
     44   'histograms',
     45   'fieldtrials',
     46   'enums'
     47 ]
     48 
     49 # Extra vertical spacing rules for special tag names.
     50 # {tag_name: (newlines_after_open, newlines_before_close, newlines_after_close)}
     51 TAGS_THAT_HAVE_EXTRA_NEWLINE = {
     52   'histogram-configuration': (2, 1, 1),
     53   'histograms': (2, 1, 1),
     54   'fieldtrials': (2, 1, 1),
     55   'enums': (2, 1, 1),
     56   'histogram': (1, 1, 1),
     57   'enum': (1, 1, 1),
     58   'fieldtrial': (1, 1, 1),
     59 }
     60 
     61 # Tags that we allow to be squished into a single line for brevity.
     62 TAGS_THAT_ALLOW_SINGLE_LINE = [
     63   'summary',
     64   'int',
     65 ]
     66 
     67 # Tags whose children we want to alphabetize. The key is the parent tag name,
     68 # and the value is a pair of the tag name of the children we want to sort,
     69 # and a key function that maps each child node to the desired sort key.
     70 ALPHABETIZATION_RULES = {
     71   'histograms': ('histogram', lambda n: n.attributes['name'].value.lower()),
     72   'enums': ('enum', lambda n: n.attributes['name'].value.lower()),
     73   'enum': ('int', lambda n: int(n.attributes['value'].value)),
     74   'fieldtrials': ('fieldtrial', lambda n: n.attributes['name'].value.lower()),
     75   'fieldtrial': ('affected-histogram',
     76                  lambda n: n.attributes['name'].value.lower()),
     77 }
     78 
     79 
     80 class Error(Exception):
     81   pass
     82 
     83 
     84 def LastLineLength(s):
     85   """Returns the length of the last line in s.
     86 
     87   Args:
     88     s: A multi-line string, including newlines.
     89 
     90   Returns:
     91     The length of the last line in s, in characters.
     92   """
     93   if s.rfind('\n') == -1: return len(s)
     94   return len(s) - s.rfind('\n') - len('\n')
     95 
     96 
     97 def XmlEscape(s):
     98   """XML-escapes the given string, replacing magic characters (&<>") with their
     99   escaped equivalents."""
    100   s = s.replace("&", "&amp;").replace("<", "&lt;")
    101   s = s.replace("\"", "&quot;").replace(">", "&gt;")
    102   return s
    103 
    104 
    105 def PrettyPrintNode(node, indent=0):
    106   """Pretty-prints the given XML node at the given indent level.
    107 
    108   Args:
    109     node: The minidom node to pretty-print.
    110     indent: The current indent level.
    111 
    112   Returns:
    113     The pretty-printed string (including embedded newlines).
    114 
    115   Raises:
    116     Error if the XML has unknown tags or attributes.
    117   """
    118   # Handle the top-level document node.
    119   if node.nodeType == xml.dom.minidom.Node.DOCUMENT_NODE:
    120     return '\n'.join([PrettyPrintNode(n) for n in node.childNodes])
    121 
    122   # Handle text nodes.
    123   if node.nodeType == xml.dom.minidom.Node.TEXT_NODE:
    124     # Wrap each paragraph in the text to fit in the 80 column limit.
    125     wrapper = textwrap.TextWrapper()
    126     wrapper.initial_indent = ' ' * indent
    127     wrapper.subsequent_indent = ' ' * indent
    128     wrapper.break_on_hyphens = False
    129     wrapper.break_long_words = False
    130     wrapper.width = WRAP_COLUMN
    131     text = XmlEscape(node.data)
    132     # Remove any common indent.
    133     text = textwrap.dedent(text.strip('\n'))
    134     lines = text.split('\n')
    135     # Split the text into paragraphs at blank line boundaries.
    136     paragraphs = [[]]
    137     for l in lines:
    138       if len(l.strip()) == 0 and len(paragraphs[-1]) > 0:
    139         paragraphs.append([])
    140       else:
    141         paragraphs[-1].append(l)
    142     # Remove trailing empty paragraph if present.
    143     if len(paragraphs) > 0 and len(paragraphs[-1]) == 0:
    144       paragraphs = paragraphs[:-1]
    145     # Wrap each paragraph and separate with two newlines.
    146     return '\n\n'.join([wrapper.fill('\n'.join(p)) for p in paragraphs])
    147 
    148   # Handle element nodes.
    149   if node.nodeType == xml.dom.minidom.Node.ELEMENT_NODE:
    150     newlines_after_open, newlines_before_close, newlines_after_close = (
    151       TAGS_THAT_HAVE_EXTRA_NEWLINE.get(node.tagName, (1, 1, 0)))
    152     # Open the tag.
    153     s = ' ' * indent + '<' + node.tagName
    154 
    155     # Calculate how much space to allow for the '>' or '/>'.
    156     closing_chars = 1
    157     if not node.childNodes:
    158       closing_chars = 2
    159 
    160     # Pretty-print the attributes.
    161     attributes = node.attributes.keys()
    162     if attributes:
    163       # Reorder the attributes.
    164       if not node.tagName in ATTRIBUTE_ORDER:
    165         unrecognized_attributes = attributes;
    166       else:
    167         unrecognized_attributes = (
    168           [a for a in attributes if not a in ATTRIBUTE_ORDER[node.tagName]])
    169         attributes = (
    170           [a for a in ATTRIBUTE_ORDER[node.tagName] if a in attributes])
    171 
    172       for a in unrecognized_attributes:
    173         logging.error(
    174             'Unrecognized attribute "%s" in tag "%s"' % (a, node.tagName))
    175       if unrecognized_attributes:
    176         raise Error()
    177 
    178       for a in attributes:
    179         value = XmlEscape(node.attributes[a].value)
    180         # Replace sequences of whitespace with single spaces.
    181         words = value.split()
    182         a_str = ' %s="%s"' % (a, ' '.join(words))
    183         # Start a new line if the attribute will make this line too long.
    184         if LastLineLength(s) + len(a_str) + closing_chars > WRAP_COLUMN:
    185           s += '\n' + ' ' * (indent + 3)
    186         # Output everything up to the first quote.
    187         s += ' %s="' % (a)
    188         value_indent_level = LastLineLength(s)
    189         # Output one word at a time, splitting to the next line where necessary.
    190         column = value_indent_level
    191         for i, word in enumerate(words):
    192           # This is slightly too conservative since not every word will be
    193           # followed by the closing characters...
    194           if i > 0 and (column + len(word) + 1 + closing_chars > WRAP_COLUMN):
    195             s = s.rstrip()  # remove any trailing whitespace
    196             s += '\n' + ' ' * value_indent_level
    197             column = value_indent_level
    198           s += word + ' '
    199           column += len(word) + 1
    200         s = s.rstrip()  # remove any trailing whitespace
    201         s += '"'
    202       s = s.rstrip()  # remove any trailing whitespace
    203 
    204     # Pretty-print the child nodes.
    205     if node.childNodes:
    206       s += '>'
    207       # Calculate the new indent level for child nodes.
    208       new_indent = indent
    209       if node.tagName not in TAGS_THAT_DONT_INDENT:
    210         new_indent += 2
    211       child_nodes = node.childNodes
    212 
    213       # Recursively pretty-print the child nodes.
    214       child_nodes = [PrettyPrintNode(n, indent=new_indent) for n in child_nodes]
    215       child_nodes = [c for c in child_nodes if len(c.strip()) > 0]
    216 
    217       # Determine whether we can fit the entire node on a single line.
    218       close_tag = '</%s>' % node.tagName
    219       space_left = WRAP_COLUMN - LastLineLength(s) - len(close_tag)
    220       if (node.tagName in TAGS_THAT_ALLOW_SINGLE_LINE and
    221           len(child_nodes) == 1 and len(child_nodes[0].strip()) <= space_left):
    222         s += child_nodes[0].strip()
    223       else:
    224         s += '\n' * newlines_after_open + '\n'.join(child_nodes)
    225         s += '\n' * newlines_before_close + ' ' * indent
    226       s += close_tag
    227     else:
    228       s += '/>'
    229     s += '\n' * newlines_after_close
    230     return s
    231 
    232   # Handle comment nodes.
    233   if node.nodeType == xml.dom.minidom.Node.COMMENT_NODE:
    234     return '<!--%s-->\n' % node.data
    235 
    236   # Ignore other node types. This could be a processing instruction (<? ... ?>)
    237   # or cdata section (<![CDATA[...]]!>), neither of which are legal in the
    238   # histograms XML at present.
    239   logging.error('Ignoring unrecognized node data: %s' % node.toxml())
    240   raise Error()
    241 
    242 
    243 def unsafeAppendChild(parent, child):
    244   """Append child to parent's list of children, ignoring the possibility that it
    245   is already in another node's childNodes list.  Requires that the previous
    246   parent of child is discarded (to avoid non-tree DOM graphs).
    247   This can provide a significant speedup as O(n^2) operations are removed (in
    248   particular, each child insertion avoids the need to traverse the old parent's
    249   entire list of children)."""
    250   child.parentNode = None
    251   parent.appendChild(child)
    252   child.parentNode = parent
    253 
    254 
    255 def TransformByAlphabetizing(node):
    256   """Transform the given XML by alphabetizing specific node types according to
    257   the rules in ALPHABETIZATION_RULES.
    258 
    259   Args:
    260     node: The minidom node to transform.
    261 
    262   Returns:
    263     The minidom node, with children appropriately alphabetized. Note that the
    264     transformation is done in-place, i.e. the original minidom tree is modified
    265     directly.
    266   """
    267   if node.nodeType != xml.dom.minidom.Node.ELEMENT_NODE:
    268     for c in node.childNodes: TransformByAlphabetizing(c)
    269     return node
    270 
    271   # Element node with a tag name that we alphabetize the children of?
    272   if node.tagName in ALPHABETIZATION_RULES:
    273     # Put subnodes in a list of node,key pairs to allow for custom sorting.
    274     subtag, key_function = ALPHABETIZATION_RULES[node.tagName]
    275     subnodes = []
    276     last_key = -1
    277     for c in node.childNodes:
    278       if (c.nodeType == xml.dom.minidom.Node.ELEMENT_NODE and
    279           c.tagName == subtag):
    280         last_key = key_function(c)
    281       # Subnodes that we don't want to rearrange use the last node's key,
    282       # so they stay in the same relative position.
    283       subnodes.append( (c, last_key) )
    284 
    285     # Sort the subnode list.
    286     subnodes.sort(key=lambda pair: pair[1])
    287 
    288     # Re-add the subnodes, transforming each recursively.
    289     while node.firstChild:
    290       node.removeChild(node.firstChild)
    291     for (c, _) in subnodes:
    292       unsafeAppendChild(node, TransformByAlphabetizing(c))
    293     return node
    294 
    295   # Recursively handle other element nodes and other node types.
    296   for c in node.childNodes: TransformByAlphabetizing(c)
    297   return node
    298 
    299 
    300 def PrettyPrint(raw_xml):
    301   """Pretty-print the given XML.
    302 
    303   Args:
    304     xml: The contents of the histograms XML file, as a string.
    305 
    306   Returns:
    307     The pretty-printed version.
    308   """
    309   tree = xml.dom.minidom.parseString(raw_xml)
    310   tree = TransformByAlphabetizing(tree)
    311   return PrettyPrintNode(tree)
    312 
    313 
    314 def main():
    315   logging.basicConfig(level=logging.INFO)
    316 
    317   presubmit = ('--presubmit' in sys.argv)
    318 
    319   logging.info('Loading histograms.xml...')
    320   with open('histograms.xml', 'rb') as f:
    321     xml = f.read()
    322 
    323   # Check there are no CR ('\r') characters in the file.
    324   if '\r' in xml:
    325     logging.info('DOS-style line endings (CR characters) detected - these are '
    326                  'not allowed. Please run dos2unix histograms.xml')
    327     sys.exit(1)
    328 
    329   logging.info('Pretty-printing...')
    330   try:
    331     pretty = PrettyPrint(xml)
    332   except Error:
    333     logging.error('Aborting parsing due to fatal errors.')
    334     sys.exit(1)
    335 
    336   if xml == pretty:
    337     logging.info('histograms.xml is correctly pretty-printed.')
    338     sys.exit(0)
    339   if presubmit:
    340     logging.info('histograms.xml is not formatted correctly; run '
    341                  'pretty_print.py to fix.')
    342     sys.exit(1)
    343   if not diffutil.PromptUserToAcceptDiff(
    344       xml, pretty,
    345       'Is the prettified version acceptable?'):
    346     logging.error('Aborting')
    347     return
    348 
    349   logging.info('Creating backup file histograms.before.pretty-print.xml')
    350   shutil.move('histograms.xml', 'histograms.before.pretty-print.xml')
    351 
    352   logging.info('Writing new histograms.xml file')
    353   with open('histograms.xml', 'wb') as f:
    354     f.write(pretty)
    355 
    356 
    357 if __name__ == '__main__':
    358   main()
    359