Home | History | Annotate | Download | only in consumers
      1 """
      2 Looks for duplicate resource definitions and removes all but the last one.
      3 """
      4 
      5 import os.path
      6 import xml.parsers.expat
      7 
      8 class DuplicateRemover:
      9     def matches(self, file_path):
     10         dirname, basename = os.path.split(file_path)
     11         dirname = os.path.split(dirname)[1]
     12         return dirname.startswith("values") and basename.endswith(".xml")
     13 
     14     def consume(self, xml_path, input):
     15         parser = xml.parsers.expat.ParserCreate("utf-8")
     16         parser.returns_unicode = True
     17         tracker = ResourceDefinitionLocator(parser)
     18         parser.StartElementHandler = tracker.start_element
     19         parser.EndElementHandler = tracker.end_element
     20         parser.Parse(input)
     21 
     22         # Treat the input as UTF-8 or else column numbers will be wrong.
     23         input_lines = input.decode('utf-8').splitlines(True)
     24 
     25         # Extract the duplicate resource definitions, ignoring the last definition
     26         # which will take precedence and be left intact.
     27         duplicates = []
     28         for res_name, entries in tracker.resource_definitions.iteritems():
     29             if len(entries) > 1:
     30                 duplicates += entries[:-1]
     31 
     32         # Sort the duplicates so that they are in order. That way we only do one pass.
     33         duplicates = sorted(duplicates, key=lambda x: x.start)
     34 
     35         last_line_no = 0
     36         last_col_no = 0
     37         output_lines = []
     38         current_line = ""
     39         for definition in duplicates:
     40             print "{0}: removing duplicate resource '{1}'".format(xml_path, definition.name)
     41 
     42             if last_line_no < definition.start[0]:
     43                 # The next definition is on a new line, so write what we have
     44                 # to the output.
     45                 new_line = current_line + input_lines[last_line_no][last_col_no:]
     46                 if not new_line.isspace():
     47                     output_lines.append(new_line)
     48                 current_line = ""
     49                 last_col_no = 0
     50                 last_line_no += 1
     51 
     52             # Copy all the lines up until this one.
     53             for line_to_copy in xrange(last_line_no, definition.start[0]):
     54                 output_lines.append(input_lines[line_to_copy])
     55 
     56             # Add to the existing line we're building, by including the prefix of this line
     57             # and skipping the lines and characters until the end of this duplicate
     58             # definition.
     59             last_line_no = definition.start[0]
     60             current_line += input_lines[last_line_no][last_col_no:definition.start[1]]
     61             last_line_no = definition.end[0]
     62             last_col_no = definition.end[1]
     63 
     64         new_line = current_line + input_lines[last_line_no][last_col_no:]
     65         if not new_line.isspace():
     66             output_lines.append(new_line)
     67         current_line = ""
     68         last_line_no += 1
     69         last_col_no = 0
     70 
     71         for line_to_copy in xrange(last_line_no, len(input_lines)):
     72             output_lines.append(input_lines[line_to_copy])
     73 
     74         if len(duplicates) > 0:
     75             print "deduped {0}".format(xml_path)
     76             return "".join(output_lines).encode("utf-8")
     77         return input
     78 
     79 class Duplicate:
     80     """A small struct to maintain the positions of a Duplicate resource definition."""
     81     def __init__(self, name, product, depth, start, end):
     82         self.name = name
     83         self.product = product
     84         self.depth = depth
     85         self.start = start
     86         self.end = end
     87 
     88 class ResourceDefinitionLocator:
     89     """Callback class for xml.parsers.expat which records resource definitions and their
     90     locations.
     91     """
     92     def __init__(self, parser):
     93         self.resource_definitions = {}
     94         self._parser = parser
     95         self._depth = 0
     96         self._current_resource = None
     97 
     98     def start_element(self, tag_name, attrs):
     99         self._depth += 1
    100         if self._depth == 2 and tag_name not in ["public", "java-symbol", "eat-comment", "skip"]:
    101             resource_name = None
    102             product = ""
    103             try:
    104                 product = attrs["product"]
    105             except KeyError:
    106                 pass
    107 
    108             if tag_name == "item":
    109                 resource_name = "{0}/{1}".format(attrs["type"], attrs["name"])
    110             else:
    111                 resource_name = "{0}/{1}".format(tag_name, attrs["name"])
    112             self._current_resource = Duplicate(
    113                     resource_name,
    114                     product,
    115                     self._depth,
    116                     (self._parser.CurrentLineNumber - 1, self._parser.CurrentColumnNumber),
    117                     None)
    118 
    119     def end_element(self, tag_name):
    120         if self._current_resource and self._depth == self._current_resource.depth:
    121             # Record the end position of the element, which is the length of the name
    122             # plus the </> symbols (len("</>") == 3).
    123             self._current_resource.end = (self._parser.CurrentLineNumber - 1,
    124                     self._parser.CurrentColumnNumber + 3 + len(tag_name))
    125             key_name = "{0}:{1}".format(self._current_resource.name,
    126                     self._current_resource.product)
    127             try:
    128                 self.resource_definitions[key_name] += [self._current_resource]
    129             except KeyError:
    130                 self.resource_definitions[key_name] = [self._current_resource]
    131             self._current_resource = None
    132         self._depth -= 1
    133