Home | History | Annotate | Download | only in gather
      1 #!/usr/bin/env python
      2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 '''A gatherer for the TotalRecall brand of HTML templates with replaceable
      7 portions.  We wanted to reuse extern.tclib.api.handlers.html.TCHTMLParser
      8 but this proved impossible due to the fact that the TotalRecall HTML templates
      9 are in general quite far from parseable HTML and the TCHTMLParser derives
     10 from HTMLParser.HTMLParser which requires relatively well-formed HTML.  Some
     11 examples of "HTML" from the TotalRecall HTML templates that wouldn't be
     12 parseable include things like:
     13 
     14   <a [PARAMS]>blabla</a>  (not parseable because attributes are invalid)
     15 
     16   <table><tr><td>[LOTSOFSTUFF]</tr></table> (not parseable because closing
     17                                             </td> is in the HTML [LOTSOFSTUFF]
     18                                             is replaced by)
     19 
     20 The other problem with using general parsers (such as TCHTMLParser) is that
     21 we want to make sure we output the TotalRecall template with as little changes
     22 as possible in terms of whitespace characters, layout etc.  With any parser
     23 that generates a parse tree, and generates output by dumping the parse tree,
     24 we would always have little inconsistencies which could cause bugs (the
     25 TotalRecall template stuff is quite brittle and can break if e.g. a tab
     26 character is replaced with spaces).
     27 
     28 The solution, which may be applicable to some other HTML-like template
     29 languages floating around Google, is to create a parser with a simple state
     30 machine that keeps track of what kind of tag it's inside, and whether it's in
     31 a translateable section or not.  Translateable sections are:
     32 
     33 a) text (including [BINGO] replaceables) inside of tags that
     34    can contain translateable text (which is all tags except
     35    for a few)
     36 
     37 b) text inside of an 'alt' attribute in an <image> element, or
     38    the 'value' attribute of a <submit>, <button> or <text>
     39    element.
     40 
     41 The parser does not build up a parse tree but rather a "skeleton" which
     42 is a list of nontranslateable strings intermingled with grit.clique.MessageClique
     43 objects.  This simplifies the parser considerably compared to a regular HTML
     44 parser.  To output a translated document, each item in the skeleton is
     45 printed out, with the relevant Translation from each MessageCliques being used
     46 for the requested language.
     47 
     48 This implementation borrows some code, constants and ideas from
     49 extern.tclib.api.handlers.html.TCHTMLParser.
     50 '''
     51 
     52 
     53 import re
     54 import types
     55 
     56 from grit import clique
     57 from grit import exception
     58 from grit import lazy_re
     59 from grit import util
     60 from grit import tclib
     61 
     62 from grit.gather import interface
     63 
     64 
     65 # HTML tags which break (separate) chunks.
     66 _BLOCK_TAGS = ['script', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'br',
     67               'body', 'style', 'head', 'title', 'table', 'tr', 'td', 'th',
     68               'ul', 'ol', 'dl', 'nl', 'li', 'div', 'object', 'center',
     69               'html', 'link', 'form', 'select', 'textarea',
     70               'button', 'option', 'map', 'area', 'blockquote', 'pre',
     71               'meta', 'xmp', 'noscript', 'label', 'tbody', 'thead',
     72               'script', 'style', 'pre', 'iframe', 'img', 'input', 'nowrap',
     73               'fieldset', 'legend']
     74 
     75 # HTML tags which may appear within a chunk.
     76 _INLINE_TAGS = ['b', 'i', 'u', 'tt', 'code', 'font', 'a', 'span', 'small',
     77                'key', 'nobr', 'url', 'em', 's', 'sup', 'strike',
     78                'strong']
     79 
     80 # HTML tags within which linebreaks are significant.
     81 _PREFORMATTED_TAGS = ['textarea', 'xmp', 'pre']
     82 
     83 # An array mapping some of the inline HTML tags to more meaningful
     84 # names for those tags.  This will be used when generating placeholders
     85 # representing these tags.
     86 _HTML_PLACEHOLDER_NAMES = { 'a' : 'link', 'br' : 'break', 'b' : 'bold',
     87   'i' : 'italic', 'li' : 'item', 'ol' : 'ordered_list', 'p' : 'paragraph',
     88   'ul' : 'unordered_list', 'img' : 'image', 'em' : 'emphasis' }
     89 
     90 # We append each of these characters in sequence to distinguish between
     91 # different placeholders with basically the same name (e.g. BOLD1, BOLD2).
     92 # Keep in mind that a placeholder name must not be a substring of any other
     93 # placeholder name in the same message, so we can't simply count (BOLD_1
     94 # would be a substring of BOLD_10).
     95 _SUFFIXES = '123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
     96 
     97 # Matches whitespace in an HTML document.  Also matches HTML comments, which are
     98 # treated as whitespace.
     99 _WHITESPACE = lazy_re.compile(r'(\s|&nbsp;|\\n|\\r|<!--\s*desc\s*=.*?-->)+',
    100                               re.DOTALL)
    101 
    102 # Matches whitespace sequences which can be folded into a single whitespace
    103 # character.  This matches single characters so that non-spaces are replaced
    104 # with spaces.
    105 _FOLD_WHITESPACE = lazy_re.compile(r'\s+')
    106 
    107 # Finds a non-whitespace character
    108 _NON_WHITESPACE = lazy_re.compile(r'\S')
    109 
    110 # Matches two or more &nbsp; in a row (a single &nbsp is not changed into
    111 # placeholders because different languages require different numbers of spaces
    112 # and placeholders must match exactly; more than one is probably a "special"
    113 # whitespace sequence and should be turned into a placeholder).
    114 _NBSP = lazy_re.compile(r'&nbsp;(&nbsp;)+')
    115 
    116 # Matches nontranslateable chunks of the document
    117 _NONTRANSLATEABLES = lazy_re.compile(r'''
    118   <\s*script.+?<\s*/\s*script\s*>
    119   |
    120   <\s*style.+?<\s*/\s*style\s*>
    121   |
    122   <!--.+?-->
    123   |
    124   <\?IMPORT\s.+?>           # import tag
    125   |
    126   <\s*[a-zA-Z_]+:.+?>       # custom tag (open)
    127   |
    128   <\s*/\s*[a-zA-Z_]+:.+?>   # custom tag (close)
    129   |
    130   <!\s*[A-Z]+\s*([^>]+|"[^"]+"|'[^']+')*?>
    131   ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE)
    132 
    133 # Matches a tag and its attributes
    134 _ELEMENT = lazy_re.compile(r'''
    135   # Optional closing /, element name
    136   <\s*(?P<closing>/)?\s*(?P<element>[a-zA-Z0-9]+)\s*
    137   # Attributes and/or replaceables inside the tag, if any
    138   (?P<atts>(
    139     \s*([a-zA-Z_][-:.a-zA-Z_0-9]*) # Attribute name
    140     (\s*=\s*(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?
    141     |
    142     \s*\[(\$?\~)?([A-Z0-9-_]+?)(\~\$?)?\]
    143   )*)
    144   \s*(?P<empty>/)?\s*> # Optional empty-tag closing /, and tag close
    145   ''',
    146   re.MULTILINE | re.DOTALL | re.VERBOSE)
    147 
    148 # Matches elements that may have translateable attributes.  The value of these
    149 # special attributes is given by group 'value1' or 'value2'.  Note that this
    150 # regexp demands that the attribute value be quoted; this is necessary because
    151 # the non-tree-building nature of the parser means we don't know when we're
    152 # writing out attributes, so we wouldn't know to escape spaces.
    153 _SPECIAL_ELEMENT = lazy_re.compile(r'''
    154   <\s*(
    155     input[^>]+?value\s*=\s*(\'(?P<value3>[^\']*)\'|"(?P<value4>[^"]*)")
    156     [^>]+type\s*=\s*"?'?(button|reset|text|submit)'?"?
    157     |
    158     (
    159       table[^>]+?title\s*=
    160       |
    161       img[^>]+?alt\s*=
    162       |
    163       input[^>]+?type\s*=\s*"?'?(button|reset|text|submit)'?"?[^>]+?value\s*=
    164     )
    165     \s*(\'(?P<value1>[^\']*)\'|"(?P<value2>[^"]*)")
    166   )[^>]*?>
    167   ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE)
    168 
    169 # Matches stuff that is translateable if it occurs in the right context
    170 # (between tags).  This includes all characters and character entities.
    171 # Note that this also matches &nbsp; which needs to be handled as whitespace
    172 # before this regexp is applied.
    173 _CHARACTERS = lazy_re.compile(r'''
    174   (
    175     \w
    176     |
    177     [\!\@\#\$\%\^\*\(\)\-\=\_\+\[\]\{\}\\\|\;\:\'\"\,\.\/\?\`\~]
    178     |
    179     &(\#[0-9]+|\#x[0-9a-fA-F]+|[A-Za-z0-9]+);
    180   )+
    181   ''', re.MULTILINE | re.DOTALL | re.VERBOSE)
    182 
    183 # Matches Total Recall's "replaceable" tags, which are just any text
    184 # in capitals enclosed by delimiters like [] or [~~] or [$~~$] (e.g. [HELLO],
    185 # [~HELLO~] and [$~HELLO~$]).
    186 _REPLACEABLE = lazy_re.compile(r'\[(\$?\~)?(?P<name>[A-Z0-9-_]+?)(\~\$?)?\]',
    187                                re.MULTILINE)
    188 
    189 
    190 # Matches the silly [!]-prefixed "header" that is used in some TotalRecall
    191 # templates.
    192 _SILLY_HEADER = lazy_re.compile(r'\[!\]\ntitle\t(?P<title>[^\n]+?)\n.+?\n\n',
    193                                 re.MULTILINE | re.DOTALL)
    194 
    195 
    196 # Matches a comment that provides a description for the message it occurs in.
    197 _DESCRIPTION_COMMENT = lazy_re.compile(
    198   r'<!--\s*desc\s*=\s*(?P<description>.+?)\s*-->', re.DOTALL)
    199 
    200 # Matches a comment which is used to break apart multiple messages.
    201 _MESSAGE_BREAK_COMMENT = lazy_re.compile(r'<!--\s*message-break\s*-->',
    202                                          re.DOTALL)
    203 
    204 # Matches a comment which is used to prevent block tags from splitting a message
    205 _MESSAGE_NO_BREAK_COMMENT = re.compile(r'<!--\s*message-no-break\s*-->',
    206                                        re.DOTALL)
    207 
    208 
    209 _DEBUG = 0
    210 def _DebugPrint(text):
    211   if _DEBUG:
    212     print text.encode('utf-8')
    213 
    214 
    215 class HtmlChunks(object):
    216   '''A parser that knows how to break an HTML-like document into a list of
    217   chunks, where each chunk is either translateable or non-translateable.
    218   The chunks are unmodified sections of the original document, so concatenating
    219   the text of all chunks would result in the original document.'''
    220 
    221   def InTranslateable(self):
    222     return self.last_translateable != -1
    223 
    224   def Rest(self):
    225     return self.text_[self.current:]
    226 
    227   def StartTranslateable(self):
    228     assert not self.InTranslateable()
    229     if self.current != 0:
    230       # Append a nontranslateable chunk
    231       chunk_text = self.text_[self.chunk_start : self.last_nontranslateable + 1]
    232       # Needed in the case where document starts with a translateable.
    233       if len(chunk_text) > 0:
    234         self.AddChunk(False, chunk_text)
    235     self.chunk_start = self.last_nontranslateable + 1
    236     self.last_translateable = self.current
    237     self.last_nontranslateable = -1
    238 
    239   def EndTranslateable(self):
    240     assert self.InTranslateable()
    241     # Append a translateable chunk
    242     self.AddChunk(True,
    243                   self.text_[self.chunk_start : self.last_translateable + 1])
    244     self.chunk_start = self.last_translateable + 1
    245     self.last_translateable = -1
    246     self.last_nontranslateable = self.current
    247 
    248   def AdvancePast(self, match):
    249     self.current += match.end()
    250 
    251   def AddChunk(self, translateable, text):
    252     '''Adds a chunk to self, removing linebreaks and duplicate whitespace
    253     if appropriate.
    254     '''
    255     m = _DESCRIPTION_COMMENT.search(text)
    256     if m:
    257       self.last_description = m.group('description')
    258       # Remove the description from the output text
    259       text = _DESCRIPTION_COMMENT.sub('', text)
    260 
    261     m = _MESSAGE_BREAK_COMMENT.search(text)
    262     if m:
    263       # Remove the coment from the output text.  It should already effectively
    264       # break apart messages.
    265       text = _MESSAGE_BREAK_COMMENT.sub('', text)
    266 
    267     if translateable and not self.last_element_ in _PREFORMATTED_TAGS:
    268       if self.fold_whitespace_:
    269         # Fold whitespace sequences if appropriate.  This is optional because it
    270         # alters the output strings.
    271         text = _FOLD_WHITESPACE.sub(' ', text)
    272       else:
    273         text = text.replace('\n', ' ')
    274         text = text.replace('\r', ' ')
    275         # This whitespace folding doesn't work in all cases, thus the
    276         # fold_whitespace flag to support backwards compatibility.
    277         text = text.replace('   ', ' ')
    278         text = text.replace('  ', ' ')
    279 
    280     if translateable:
    281       description = self.last_description
    282       self.last_description = ''
    283     else:
    284       description = ''
    285 
    286     if text != '':
    287       self.chunks_.append((translateable, text, description))
    288 
    289   def Parse(self, text, fold_whitespace):
    290     '''Parses self.text_ into an intermediate format stored in self.chunks_
    291     which is translateable and nontranslateable chunks.  Also returns
    292     self.chunks_
    293 
    294     Args:
    295       text: The HTML for parsing.
    296       fold_whitespace: Whether whitespace sequences should be folded into a
    297         single space.
    298 
    299     Return:
    300       [chunk1, chunk2, chunk3, ...]  (instances of class Chunk)
    301     '''
    302     #
    303     # Chunker state
    304     #
    305 
    306     self.text_ = text
    307     self.fold_whitespace_ = fold_whitespace
    308 
    309     # A list of tuples (is_translateable, text) which represents the document
    310     # after chunking.
    311     self.chunks_ = []
    312 
    313     # Start index of the last chunk, whether translateable or not
    314     self.chunk_start = 0
    315 
    316     # Index of the last for-sure translateable character if we are parsing
    317     # a translateable chunk, -1 to indicate we are not in a translateable chunk.
    318     # This is needed so that we don't include trailing whitespace in the
    319     # translateable chunk (whitespace is neutral).
    320     self.last_translateable = -1
    321 
    322     # Index of the last for-sure nontranslateable character if we are parsing
    323     # a nontranslateable chunk, -1 if we are not in a nontranslateable chunk.
    324     # This is needed to make sure we can group e.g. "<b>Hello</b> there"
    325     # together instead of just "Hello</b> there" which would be much worse
    326     # for translation.
    327     self.last_nontranslateable = -1
    328 
    329     # Index of the character we're currently looking at.
    330     self.current = 0
    331 
    332     # The name of the last block element parsed.
    333     self.last_element_ = ''
    334 
    335     # The last explicit description we found.
    336     self.last_description = ''
    337 
    338     # Whether no-break was the last chunk seen
    339     self.last_nobreak = False
    340 
    341     while self.current < len(self.text_):
    342       _DebugPrint('REST: %s' % self.text_[self.current:self.current+60])
    343 
    344       m = _MESSAGE_NO_BREAK_COMMENT.match(self.Rest())
    345       if m:
    346         self.AdvancePast(m)
    347         self.last_nobreak = True
    348         continue
    349 
    350       # Try to match whitespace
    351       m = _WHITESPACE.match(self.Rest())
    352       if m:
    353         # Whitespace is neutral, it just advances 'current' and does not switch
    354         # between translateable/nontranslateable.  If we are in a
    355         # nontranslateable section that extends to the current point, we extend
    356         # it to include the whitespace.  If we are in a translateable section,
    357         # we do not extend it until we find
    358         # more translateable parts, because we never want a translateable chunk
    359         # to end with whitespace.
    360         if (not self.InTranslateable() and
    361             self.last_nontranslateable == self.current - 1):
    362           self.last_nontranslateable = self.current + m.end() - 1
    363         self.AdvancePast(m)
    364         continue
    365 
    366       # Then we try to match nontranslateables
    367       m = _NONTRANSLATEABLES.match(self.Rest())
    368       if m:
    369         if self.InTranslateable():
    370           self.EndTranslateable()
    371         self.last_nontranslateable = self.current + m.end() - 1
    372         self.AdvancePast(m)
    373         continue
    374 
    375       # Now match all other HTML element tags (opening, closing, or empty, we
    376       # don't care).
    377       m = _ELEMENT.match(self.Rest())
    378       if m:
    379         element_name = m.group('element').lower()
    380         if element_name in _BLOCK_TAGS:
    381           self.last_element_ = element_name
    382           if self.InTranslateable():
    383             if self.last_nobreak:
    384               self.last_nobreak = False
    385             else:
    386               self.EndTranslateable()
    387 
    388           # Check for "special" elements, i.e. ones that have a translateable
    389           # attribute, and handle them correctly.  Note that all of the
    390           # "special" elements are block tags, so no need to check for this
    391           # if the tag is not a block tag.
    392           sm = _SPECIAL_ELEMENT.match(self.Rest())
    393           if sm:
    394             # Get the appropriate group name
    395             for group in sm.groupdict().keys():
    396               if sm.groupdict()[group]:
    397                 break
    398 
    399             # First make a nontranslateable chunk up to and including the
    400             # quote before the translateable attribute value
    401             self.AddChunk(False, self.text_[
    402               self.chunk_start : self.current + sm.start(group)])
    403             # Then a translateable for the translateable bit
    404             self.AddChunk(True, self.Rest()[sm.start(group) : sm.end(group)])
    405             # Finally correct the data invariant for the parser
    406             self.chunk_start = self.current + sm.end(group)
    407 
    408           self.last_nontranslateable = self.current + m.end() - 1
    409         elif self.InTranslateable():
    410           # We're in a translateable and the tag is an inline tag, so we
    411           # need to include it in the translateable.
    412           self.last_translateable = self.current + m.end() - 1
    413         self.AdvancePast(m)
    414         continue
    415 
    416       # Anything else we find must be translateable, so we advance one character
    417       # at a time until one of the above matches.
    418       if not self.InTranslateable():
    419         self.StartTranslateable()
    420       else:
    421         self.last_translateable = self.current
    422       self.current += 1
    423 
    424     # Close the final chunk
    425     if self.InTranslateable():
    426       self.AddChunk(True, self.text_[self.chunk_start : ])
    427     else:
    428       self.AddChunk(False, self.text_[self.chunk_start : ])
    429 
    430     return self.chunks_
    431 
    432 
    433 def HtmlToMessage(html, include_block_tags=False, description=''):
    434   '''Takes a bit of HTML, which must contain only "inline" HTML elements,
    435   and changes it into a tclib.Message.  This involves escaping any entities and
    436   replacing any HTML code with placeholders.
    437 
    438   If include_block_tags is true, no error will be given if block tags (e.g.
    439   <p> or <br>) are included in the HTML.
    440 
    441   Args:
    442     html: 'Hello <b>[USERNAME]</b>, how&nbsp;<i>are</i> you?'
    443     include_block_tags: False
    444 
    445   Return:
    446     tclib.Message('Hello START_BOLD1USERNAMEEND_BOLD, '
    447                   'howNBSPSTART_ITALICareEND_ITALIC you?',
    448                   [ Placeholder('START_BOLD', '<b>', ''),
    449                     Placeholder('USERNAME', '[USERNAME]', ''),
    450                     Placeholder('END_BOLD', '</b>', ''),
    451                     Placeholder('START_ITALIC', '<i>', ''),
    452                     Placeholder('END_ITALIC', '</i>', ''), ])
    453   '''
    454   # Approach is:
    455   # - first placeholderize, finding <elements>, [REPLACEABLES] and &nbsp;
    456   # - then escape all character entities in text in-between placeholders
    457 
    458   parts = []  # List of strings (for text chunks) and tuples (ID, original)
    459               # for placeholders
    460 
    461   count_names = {}  # Map of base names to number of times used
    462   end_names = {}  # Map of base names to stack of end tags (for correct nesting)
    463 
    464   def MakeNameClosure(base, type = ''):
    465     '''Returns a closure that can be called once all names have been allocated
    466     to return the final name of the placeholder.  This allows us to minimally
    467     number placeholders for non-overlap.
    468 
    469     Also ensures that END_XXX_Y placeholders have the same Y as the
    470     corresponding BEGIN_XXX_Y placeholder when we have nested tags of the same
    471     type.
    472 
    473     Args:
    474       base: 'phname'
    475       type: '' | 'begin' | 'end'
    476 
    477     Return:
    478       Closure()
    479     '''
    480     name = base.upper()
    481     if type != '':
    482       name = ('%s_%s' % (type, base)).upper()
    483 
    484     if name in count_names.keys():
    485       count_names[name] += 1
    486     else:
    487       count_names[name] = 1
    488 
    489     def MakeFinalName(name_ = name, index = count_names[name] - 1):
    490       if (type.lower() == 'end' and
    491           base in end_names.keys() and len(end_names[base])):
    492         return end_names[base].pop(-1)  # For correct nesting
    493       if count_names[name_] != 1:
    494         name_ = '%s_%s' % (name_, _SUFFIXES[index])
    495         # We need to use a stack to ensure that the end-tag suffixes match
    496         # the begin-tag suffixes.  Only needed when more than one tag of the
    497         # same type.
    498         if type == 'begin':
    499           end_name = ('END_%s_%s' % (base, _SUFFIXES[index])).upper()
    500           if base in end_names.keys():
    501             end_names[base].append(end_name)
    502           else:
    503             end_names[base] = [end_name]
    504 
    505       return name_
    506 
    507     return MakeFinalName
    508 
    509   current = 0
    510   last_nobreak = False
    511 
    512   while current < len(html):
    513     m = _MESSAGE_NO_BREAK_COMMENT.match(html[current:])
    514     if m:
    515       last_nobreak = True
    516       current += m.end()
    517       continue
    518 
    519     m = _NBSP.match(html[current:])
    520     if m:
    521       parts.append((MakeNameClosure('SPACE'), m.group()))
    522       current += m.end()
    523       continue
    524 
    525     m = _REPLACEABLE.match(html[current:])
    526     if m:
    527       # Replaceables allow - but placeholders don't, so replace - with _
    528       ph_name = MakeNameClosure('X_%s_X' % m.group('name').replace('-', '_'))
    529       parts.append((ph_name, m.group()))
    530       current += m.end()
    531       continue
    532 
    533     m = _SPECIAL_ELEMENT.match(html[current:])
    534     if m:
    535       if not include_block_tags:
    536         if last_nobreak:
    537           last_nobreak = False
    538         else:
    539           raise exception.BlockTagInTranslateableChunk(html)
    540       element_name = 'block'  # for simplification
    541       # Get the appropriate group name
    542       for group in m.groupdict().keys():
    543         if m.groupdict()[group]:
    544           break
    545       parts.append((MakeNameClosure(element_name, 'begin'),
    546                     html[current : current + m.start(group)]))
    547       parts.append(m.group(group))
    548       parts.append((MakeNameClosure(element_name, 'end'),
    549                     html[current + m.end(group) : current + m.end()]))
    550       current += m.end()
    551       continue
    552 
    553     m = _ELEMENT.match(html[current:])
    554     if m:
    555       element_name = m.group('element').lower()
    556       if not include_block_tags and not element_name in _INLINE_TAGS:
    557         if last_nobreak:
    558           last_nobreak = False
    559         else:
    560           raise exception.BlockTagInTranslateableChunk(html[current:])
    561       if element_name in _HTML_PLACEHOLDER_NAMES:  # use meaningful names
    562         element_name = _HTML_PLACEHOLDER_NAMES[element_name]
    563 
    564       # Make a name for the placeholder
    565       type = ''
    566       if not m.group('empty'):
    567         if m.group('closing'):
    568           type = 'end'
    569         else:
    570           type = 'begin'
    571       parts.append((MakeNameClosure(element_name, type), m.group()))
    572       current += m.end()
    573       continue
    574 
    575     if len(parts) and isinstance(parts[-1], types.StringTypes):
    576       parts[-1] += html[current]
    577     else:
    578       parts.append(html[current])
    579     current += 1
    580 
    581   msg_text = ''
    582   placeholders = []
    583   for part in parts:
    584     if isinstance(part, types.TupleType):
    585       final_name = part[0]()
    586       original = part[1]
    587       msg_text += final_name
    588       placeholders.append(tclib.Placeholder(final_name, original, '(HTML code)'))
    589     else:
    590       msg_text += part
    591 
    592   msg = tclib.Message(text=msg_text, placeholders=placeholders,
    593                       description=description)
    594   content = msg.GetContent()
    595   for ix in range(len(content)):
    596     if isinstance(content[ix], types.StringTypes):
    597       content[ix] = util.UnescapeHtml(content[ix], replace_nbsp=False)
    598 
    599   return msg
    600 
    601 
    602 class TrHtml(interface.GathererBase):
    603   '''Represents a document or message in the template format used by
    604   Total Recall for HTML documents.'''
    605 
    606   def __init__(self, *args, **kwargs):
    607     super(TrHtml, self).__init__(*args, **kwargs)
    608     self.have_parsed_ = False
    609     self.skeleton_ = []  # list of strings and MessageClique objects
    610     self.fold_whitespace_ = False
    611 
    612   def SetAttributes(self, attrs):
    613     '''Sets node attributes used by the gatherer.
    614 
    615     This checks the fold_whitespace attribute.
    616 
    617     Args:
    618       attrs: The mapping of node attributes.
    619     '''
    620     self.fold_whitespace_ = ('fold_whitespace' in attrs and
    621                              attrs['fold_whitespace'] == 'true')
    622 
    623   def GetText(self):
    624     '''Returns the original text of the HTML document'''
    625     return self.text_
    626 
    627   def GetTextualIds(self):
    628     return [self.extkey]
    629 
    630   def GetCliques(self):
    631     '''Returns the message cliques for each translateable message in the
    632     document.'''
    633     return [x for x in self.skeleton_ if isinstance(x, clique.MessageClique)]
    634 
    635   def Translate(self, lang, pseudo_if_not_available=True,
    636                 skeleton_gatherer=None, fallback_to_english=False):
    637     '''Returns this document with translateable messages filled with
    638     the translation for language 'lang'.
    639 
    640     Args:
    641       lang: 'en'
    642       pseudo_if_not_available: True
    643 
    644     Return:
    645       'ID_THIS_SECTION TYPE\n...BEGIN\n  "Translated message"\n......\nEND
    646 
    647     Raises:
    648       grit.exception.NotReady() if used before Parse() has been successfully
    649       called.
    650       grit.exception.NoSuchTranslation() if 'pseudo_if_not_available' is false
    651       and there is no translation for the requested language.
    652     '''
    653     if len(self.skeleton_) == 0:
    654       raise exception.NotReady()
    655 
    656     # TODO(joi) Implement support for skeleton gatherers here.
    657 
    658     out = []
    659     for item in self.skeleton_:
    660       if isinstance(item, types.StringTypes):
    661         out.append(item)
    662       else:
    663         msg = item.MessageForLanguage(lang,
    664                                       pseudo_if_not_available,
    665                                       fallback_to_english)
    666         for content in msg.GetContent():
    667           if isinstance(content, tclib.Placeholder):
    668             out.append(content.GetOriginal())
    669           else:
    670             # We escape " characters to increase the chance that attributes
    671             # will be properly escaped.
    672             out.append(util.EscapeHtml(content, True))
    673 
    674     return ''.join(out)
    675 
    676   def Parse(self):
    677     if self.have_parsed_:
    678       return
    679     self.have_parsed_ = True
    680 
    681     text = self._LoadInputFile()
    682 
    683     # Ignore the BOM character if the document starts with one.
    684     if text.startswith(u'\ufeff'):
    685       text = text[1:]
    686 
    687     self.text_ = text
    688 
    689     # Parsing is done in two phases:  First, we break the document into
    690     # translateable and nontranslateable chunks.  Second, we run through each
    691     # translateable chunk and insert placeholders for any HTML elements,
    692     # unescape escaped characters, etc.
    693 
    694     # First handle the silly little [!]-prefixed header because it's not
    695     # handled by our HTML parsers.
    696     m = _SILLY_HEADER.match(text)
    697     if m:
    698       self.skeleton_.append(text[:m.start('title')])
    699       self.skeleton_.append(self.uberclique.MakeClique(
    700         tclib.Message(text=text[m.start('title'):m.end('title')])))
    701       self.skeleton_.append(text[m.end('title') : m.end()])
    702       text = text[m.end():]
    703 
    704     chunks = HtmlChunks().Parse(text, self.fold_whitespace_)
    705 
    706     for chunk in chunks:
    707       if chunk[0]:  # Chunk is translateable
    708         self.skeleton_.append(self.uberclique.MakeClique(
    709           HtmlToMessage(chunk[1], description=chunk[2])))
    710       else:
    711         self.skeleton_.append(chunk[1])
    712 
    713     # Go through the skeleton and change any messages that consist solely of
    714     # placeholders and whitespace into nontranslateable strings.
    715     for ix in range(len(self.skeleton_)):
    716       got_text = False
    717       if isinstance(self.skeleton_[ix], clique.MessageClique):
    718         msg = self.skeleton_[ix].GetMessage()
    719         for item in msg.GetContent():
    720           if (isinstance(item, types.StringTypes) and _NON_WHITESPACE.search(item)
    721               and item != '&nbsp;'):
    722             got_text = True
    723             break
    724         if not got_text:
    725           self.skeleton_[ix] = msg.GetRealContent()
    726 
    727   def SubstituteMessages(self, substituter):
    728     '''Applies substitutions to all messages in the tree.
    729 
    730     Goes through the skeleton and finds all MessageCliques.
    731 
    732     Args:
    733       substituter: a grit.util.Substituter object.
    734     '''
    735     new_skel = []
    736     for chunk in self.skeleton_:
    737       if isinstance(chunk, clique.MessageClique):
    738         old_message = chunk.GetMessage()
    739         new_message = substituter.SubstituteMessage(old_message)
    740         if new_message is not old_message:
    741           new_skel.append(self.uberclique.MakeClique(new_message))
    742           continue
    743       new_skel.append(chunk)
    744     self.skeleton_ = new_skel
    745 
    746