#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

'''A gatherer for the TotalRecall brand of HTML templates with replaceable
portions.  We wanted to reuse extern.tclib.api.handlers.html.TCHTMLParser
but this proved impossible due to the fact that the TotalRecall HTML templates
are in general quite far from parseable HTML and the TCHTMLParser derives
from HTMLParser.HTMLParser which requires relatively well-formed HTML.  Some
examples of "HTML" from the TotalRecall HTML templates that wouldn't be
parseable include things like:

  <a [PARAMS]>blabla</a>   (not parseable because attributes are invalid)

  <table><tr><td>[LOTSOFSTUFF]</tr></table>  (not parseable because closing
                                              </td> is in the HTML that
                                              [LOTSOFSTUFF] is replaced by)

The other problem with using general parsers (such as TCHTMLParser) is that
we want to output the TotalRecall template with as few changes as possible
in terms of whitespace characters, layout etc.  With any parser that builds
a parse tree and generates output by dumping that tree, we would always get
small inconsistencies which could cause bugs (the TotalRecall template stuff
is quite brittle and can break if e.g. a tab character is replaced with
spaces).

The solution, which may be applicable to some other HTML-like template
languages floating around Google, is to create a parser with a simple state
machine that keeps track of what kind of tag it's inside, and whether it's in
a translateable section or not.  Translateable sections are:

a) text (including [BINGO] replaceables) inside of tags that
   can contain translateable text (which is all tags except
   for a few)

b) text inside of an 'alt' attribute in an <image> element, or
   the 'value' attribute of a <submit>, <button> or <text>
   element.

The parser does not build up a parse tree but rather a "skeleton" which
is a list of nontranslateable strings intermingled with
grit.clique.MessageClique objects.  This simplifies the parser considerably
compared to a regular HTML parser.  To output a translated document, each
item in the skeleton is printed out, with the relevant Translation from each
MessageClique being used for the requested language.

This implementation borrows some code, constants and ideas from
extern.tclib.api.handlers.html.TCHTMLParser.
'''


import re
import types

from grit import clique
from grit import exception
from grit import lazy_re
from grit import util
from grit import tclib

from grit.gather import interface


# HTML tags which break (separate) chunks.
_BLOCK_TAGS = ['script', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'br',
               'body', 'style', 'head', 'title', 'table', 'tr', 'td', 'th',
               'ul', 'ol', 'dl', 'nl', 'li', 'div', 'object', 'center',
               'html', 'link', 'form', 'select', 'textarea',
               'button', 'option', 'map', 'area', 'blockquote', 'pre',
               'meta', 'xmp', 'noscript', 'label', 'tbody', 'thead',
               'script', 'style', 'pre', 'iframe', 'img', 'input', 'nowrap',
               'fieldset', 'legend']

# HTML tags which may appear within a chunk.
_INLINE_TAGS = ['b', 'i', 'u', 'tt', 'code', 'font', 'a', 'span', 'small',
                'key', 'nobr', 'url', 'em', 's', 'sup', 'strike',
                'strong']
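
# For illustration (not exhaustive): with the tag sets above, a fragment such
# as '<p>Please <b>log in</b> now.</p>' should produce a single translateable
# chunk 'Please <b>log in</b> now.', because <b> is an inline tag, while the
# surrounding <p> and </p> are block tags and therefore end up in the
# nontranslateable parts of the skeleton.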

# HTML tags within which linebreaks are significant.
_PREFORMATTED_TAGS = ['textarea', 'xmp', 'pre']

# A dictionary mapping some of the inline HTML tags to more meaningful
# names for those tags.  This will be used when generating placeholders
# representing these tags.
_HTML_PLACEHOLDER_NAMES = { 'a' : 'link', 'br' : 'break', 'b' : 'bold',
  'i' : 'italic', 'li' : 'item', 'ol' : 'ordered_list', 'p' : 'paragraph',
  'ul' : 'unordered_list', 'img' : 'image', 'em' : 'emphasis' }

# We append each of these characters in sequence to distinguish between
# different placeholders with basically the same name (e.g. BOLD1, BOLD2).
# Keep in mind that a placeholder name must not be a substring of any other
# placeholder name in the same message, so we can't simply count (BOLD_1
# would be a substring of BOLD_10).
_SUFFIXES = '123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Matches whitespace in an HTML document.  Also matches HTML comments, which
# are treated as whitespace.
_WHITESPACE = lazy_re.compile(r'(\s|&nbsp;|\\n|\\r|<!--\s*desc\s*=.*?-->)+',
                              re.DOTALL)

# Matches whitespace sequences which can be folded into a single whitespace
# character.  This matches single characters so that non-spaces are replaced
# with spaces.
_FOLD_WHITESPACE = lazy_re.compile(r'\s+')

# Finds a non-whitespace character.
_NON_WHITESPACE = lazy_re.compile(r'\S')

# Matches two or more &nbsp; in a row (a single &nbsp; is not changed into
# placeholders because different languages require different numbers of spaces
# and placeholders must match exactly; more than one is probably a "special"
# whitespace sequence and should be turned into a placeholder).
_NBSP = lazy_re.compile(r'&nbsp;(&nbsp;)+')

# Matches nontranslateable chunks of the document.
_NONTRANSLATEABLES = lazy_re.compile(r'''
  <\s*script.+?<\s*/\s*script\s*>
  |
  <\s*style.+?<\s*/\s*style\s*>
  |
  <!--.+?-->
  |
  <\?IMPORT\s.+?>           # import tag
  |
  <\s*[a-zA-Z_]+:.+?>       # custom tag (open)
  |
  <\s*/\s*[a-zA-Z_]+:.+?>   # custom tag (close)
  |
  <!\s*[A-Z]+\s*([^>]+|"[^"]+"|'[^']+')*?>
  ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE)

# Matches a tag and its attributes.
_ELEMENT = lazy_re.compile(r'''
  # Optional closing /, element name
  <\s*(?P<closing>/)?\s*(?P<element>[a-zA-Z0-9]+)\s*
  # Attributes and/or replaceables inside the tag, if any
  (?P<atts>(
    \s*([a-zA-Z_][-:.a-zA-Z_0-9]*)    # Attribute name
    (\s*=\s*(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?
    |
    \s*\[(\$?\~)?([A-Z0-9-_]+?)(\~\$?)?\]
  )*)
  \s*(?P<empty>/)?\s*>      # Optional empty-tag closing /, and tag close
  ''',
  re.MULTILINE | re.DOTALL | re.VERBOSE)
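
# For illustration: _ELEMENT is deliberately loose so that it also accepts the
# not-quite-HTML found in these templates.  It should match e.g.
# '<input type=text [PARAMS] />' (element 'input', with the [PARAMS]
# replaceable among the attributes and the trailing '/' captured as 'empty')
# as well as an ordinary closing tag such as '</td>' (captured via 'closing').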

# Matches elements that may have translateable attributes.  The value of these
# special attributes is given by group 'value1' or 'value2'.  Note that this
# regexp demands that the attribute value be quoted; this is necessary because
# the non-tree-building nature of the parser means we don't know when we're
# writing out attributes, so we wouldn't know to escape spaces.
_SPECIAL_ELEMENT = lazy_re.compile(r'''
  <\s*(
    input[^>]+?value\s*=\s*(\'(?P<value3>[^\']*)\'|"(?P<value4>[^"]*)")
    [^>]+type\s*=\s*"?'?(button|reset|text|submit)'?"?
    |
    (
      table[^>]+?title\s*=
      |
      img[^>]+?alt\s*=
      |
      input[^>]+?type\s*=\s*"?'?(button|reset|text|submit)'?"?[^>]+?value\s*=
    )
    \s*(\'(?P<value1>[^\']*)\'|"(?P<value2>[^"]*)")
  )[^>]*?>
  ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE)

# Matches stuff that is translateable if it occurs in the right context
# (between tags).  This includes all characters and character entities.
# Note that this also matches &nbsp;, which needs to be handled as whitespace
# before this regexp is applied.
_CHARACTERS = lazy_re.compile(r'''
  (
    \w
    |
    [\!\@\#\$\%\^\*\(\)\-\=\_\+\[\]\{\}\\\|\;\:\'\"\,\.\/\?\`\~]
    |
    &(\#[0-9]+|\#x[0-9a-fA-F]+|[A-Za-z0-9]+);
  )+
  ''', re.MULTILINE | re.DOTALL | re.VERBOSE)

# Matches Total Recall's "replaceable" tags, which are just any text
# in capitals enclosed by delimiters like [] or [~~] or [$~~$] (e.g. [HELLO],
# [~HELLO~] and [$~HELLO~$]).
_REPLACEABLE = lazy_re.compile(r'\[(\$?\~)?(?P<name>[A-Z0-9-_]+?)(\~\$?)?\]',
                               re.MULTILINE)


# Matches the silly [!]-prefixed "header" that is used in some TotalRecall
# templates.
_SILLY_HEADER = lazy_re.compile(r'\[!\]\ntitle\t(?P<title>[^\n]+?)\n.+?\n\n',
                                re.MULTILINE | re.DOTALL)


# Matches a comment that provides a description for the message it occurs in.
_DESCRIPTION_COMMENT = lazy_re.compile(
  r'<!--\s*desc\s*=\s*(?P<description>.+?)\s*-->', re.DOTALL)

# Matches a comment which is used to break apart multiple messages.
_MESSAGE_BREAK_COMMENT = lazy_re.compile(r'<!--\s*message-break\s*-->',
                                         re.DOTALL)

# Matches a comment which is used to prevent block tags from splitting a
# message.
_MESSAGE_NO_BREAK_COMMENT = re.compile(r'<!--\s*message-no-break\s*-->',
                                       re.DOTALL)


_DEBUG = 0
def _DebugPrint(text):
  if _DEBUG:
    print text.encode('utf-8')


class HtmlChunks(object):
  '''A parser that knows how to break an HTML-like document into a list of
  chunks, where each chunk is either translateable or non-translateable.
  The chunks are unmodified sections of the original document, so
  concatenating the text of all chunks would result in the original
  document.'''

  def InTranslateable(self):
    return self.last_translateable != -1

  def Rest(self):
    return self.text_[self.current:]

  def StartTranslateable(self):
    assert not self.InTranslateable()
    if self.current != 0:
      # Append a nontranslateable chunk
      chunk_text = self.text_[self.chunk_start : self.last_nontranslateable + 1]
      # Needed in the case where document starts with a translateable.
      if len(chunk_text) > 0:
        self.AddChunk(False, chunk_text)
    self.chunk_start = self.last_nontranslateable + 1
    self.last_translateable = self.current
    self.last_nontranslateable = -1

  def EndTranslateable(self):
    assert self.InTranslateable()
    # Append a translateable chunk
    self.AddChunk(True,
                  self.text_[self.chunk_start : self.last_translateable + 1])
    self.chunk_start = self.last_translateable + 1
    self.last_translateable = -1
    self.last_nontranslateable = self.current

  def AdvancePast(self, match):
    self.current += match.end()

  def AddChunk(self, translateable, text):
    '''Adds a chunk to self, removing linebreaks and duplicate whitespace
    if appropriate.
    '''
    m = _DESCRIPTION_COMMENT.search(text)
    if m:
      self.last_description = m.group('description')
      # Remove the description from the output text
      text = _DESCRIPTION_COMMENT.sub('', text)

    m = _MESSAGE_BREAK_COMMENT.search(text)
    if m:
      # Remove the comment from the output text.  It should already
      # effectively break apart messages.
      text = _MESSAGE_BREAK_COMMENT.sub('', text)

    if translateable and not self.last_element_ in _PREFORMATTED_TAGS:
      if self.fold_whitespace_:
        # Fold whitespace sequences if appropriate.  This is optional because
        # it alters the output strings.
        text = _FOLD_WHITESPACE.sub(' ', text)
      else:
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        # This whitespace folding doesn't work in all cases, thus the
        # fold_whitespace flag to support backwards compatibility.
        text = text.replace('   ', ' ')
        text = text.replace('  ', ' ')

    if translateable:
      description = self.last_description
      self.last_description = ''
    else:
      description = ''

    if text != '':
      self.chunks_.append((translateable, text, description))

  def Parse(self, text, fold_whitespace):
    '''Parses self.text_ into an intermediate format stored in self.chunks_,
    which is a list of translateable and nontranslateable chunks.  Also
    returns self.chunks_.

    Args:
      text: The HTML for parsing.
      fold_whitespace: Whether whitespace sequences should be folded into a
        single space.

    Return:
      [chunk1, chunk2, chunk3, ...]  (instances of class Chunk)
    '''
    #
    # Chunker state
    #

    self.text_ = text
    self.fold_whitespace_ = fold_whitespace

    # A list of tuples (is_translateable, text, description) which represents
    # the document after chunking.
    self.chunks_ = []

    # Start index of the last chunk, whether translateable or not
    self.chunk_start = 0

    # Index of the last for-sure translateable character if we are parsing
    # a translateable chunk, -1 to indicate we are not in a translateable
    # chunk.  This is needed so that we don't include trailing whitespace in
    # the translateable chunk (whitespace is neutral).
    self.last_translateable = -1

    # Index of the last for-sure nontranslateable character if we are parsing
    # a nontranslateable chunk, -1 if we are not in a nontranslateable chunk.
    # This is needed to make sure we can group e.g. "<b>Hello</b> there"
    # together instead of just "Hello</b> there" which would be much worse
    # for translation.
    self.last_nontranslateable = -1

    # Index of the character we're currently looking at.
    self.current = 0

    # The name of the last block element parsed.
    self.last_element_ = ''

    # The last explicit description we found.
    self.last_description = ''

    # Whether no-break was the last chunk seen
    self.last_nobreak = False

    while self.current < len(self.text_):
      _DebugPrint('REST: %s' % self.text_[self.current:self.current+60])

      m = _MESSAGE_NO_BREAK_COMMENT.match(self.Rest())
      if m:
        self.AdvancePast(m)
        self.last_nobreak = True
        continue

      # Try to match whitespace
      m = _WHITESPACE.match(self.Rest())
      if m:
        # Whitespace is neutral, it just advances 'current' and does not
        # switch between translateable/nontranslateable.  If we are in a
        # nontranslateable section that extends to the current point, we
        # extend it to include the whitespace.  If we are in a translateable
        # section, we do not extend it until we find more translateable
        # parts, because we never want a translateable chunk to end with
        # whitespace.
        if (not self.InTranslateable() and
            self.last_nontranslateable == self.current - 1):
          self.last_nontranslateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Then we try to match nontranslateables
      m = _NONTRANSLATEABLES.match(self.Rest())
      if m:
        if self.InTranslateable():
          self.EndTranslateable()
        self.last_nontranslateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Now match all other HTML element tags (opening, closing, or empty, we
      # don't care).
      m = _ELEMENT.match(self.Rest())
      if m:
        element_name = m.group('element').lower()
        if element_name in _BLOCK_TAGS:
          self.last_element_ = element_name
          if self.InTranslateable():
            if self.last_nobreak:
              self.last_nobreak = False
            else:
              self.EndTranslateable()

          # Check for "special" elements, i.e. ones that have a translateable
          # attribute, and handle them correctly.  Note that all of the
          # "special" elements are block tags, so no need to check for this
          # if the tag is not a block tag.
          sm = _SPECIAL_ELEMENT.match(self.Rest())
          if sm:
            # Get the appropriate group name
            for group in sm.groupdict().keys():
              if sm.groupdict()[group]:
                break

            # First make a nontranslateable chunk up to and including the
            # quote before the translateable attribute value
            self.AddChunk(False, self.text_[
              self.chunk_start : self.current + sm.start(group)])
            # Then a translateable for the translateable bit
            self.AddChunk(True, self.Rest()[sm.start(group) : sm.end(group)])
            # Finally correct the data invariant for the parser
            self.chunk_start = self.current + sm.end(group)

          self.last_nontranslateable = self.current + m.end() - 1
        elif self.InTranslateable():
          # We're in a translateable and the tag is an inline tag, so we
          # need to include it in the translateable.
          self.last_translateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Anything else we find must be translateable, so we advance one
      # character at a time until one of the above matches.
      if not self.InTranslateable():
        self.StartTranslateable()
      else:
        self.last_translateable = self.current
      self.current += 1

    # Close the final chunk
    if self.InTranslateable():
      self.AddChunk(True, self.text_[self.chunk_start : ])
    else:
      self.AddChunk(False, self.text_[self.chunk_start : ])

    return self.chunks_
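

# A minimal sketch of what the chunker produces (illustrative; exact chunk
# boundaries depend on whitespace and the fold_whitespace setting):
#
#   HtmlChunks().Parse('<p>Hello <b>[USER]</b>!</p>', False)
#
# should yield roughly
#
#   [(False, '<p>', ''),
#    (True, 'Hello <b>[USER]</b>!', ''),
#    (False, '</p>', '')]
#
# i.e. (is_translateable, text, description) tuples whose texts concatenate
# back to the original input.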


def HtmlToMessage(html, include_block_tags=False, description=''):
  '''Takes a bit of HTML, which must contain only "inline" HTML elements,
  and changes it into a tclib.Message.  This involves escaping any entities
  and replacing any HTML code with placeholders.

  If include_block_tags is true, no error will be given if block tags (e.g.
  <p> or <br>) are included in the HTML.

  Args:
    html: 'Hello <b>[USERNAME]</b>, how <i>are</i> you?'
    include_block_tags: False

  Return:
    tclib.Message('Hello START_BOLD1USERNAMEEND_BOLD, '
                  'howNBSPSTART_ITALICareEND_ITALIC you?',
                  [ Placeholder('START_BOLD', '<b>', ''),
                    Placeholder('USERNAME', '[USERNAME]', ''),
                    Placeholder('END_BOLD', '</b>', ''),
                    Placeholder('START_ITALIC', '<i>', ''),
                    Placeholder('END_ITALIC', '</i>', ''), ])
  '''
  # Approach is:
  # - first placeholderize, finding <elements>, [REPLACEABLES] and &nbsp;
  # - then escape all character entities in text in-between placeholders

  parts = []  # List of strings (for text chunks) and tuples (ID, original)
              # for placeholders

  count_names = {}  # Map of base names to number of times used
  end_names = {}  # Map of base names to stack of end tags (for correct
                  # nesting)

  def MakeNameClosure(base, type = ''):
    '''Returns a closure that can be called once all names have been allocated
    to return the final name of the placeholder.  This allows us to minimally
    number placeholders for non-overlap.

    Also ensures that END_XXX_Y placeholders have the same Y as the
    corresponding BEGIN_XXX_Y placeholder when we have nested tags of the same
    type.

    Args:
      base: 'phname'
      type: '' | 'begin' | 'end'

    Return:
      Closure()
    '''
    name = base.upper()
    if type != '':
      name = ('%s_%s' % (type, base)).upper()

    if name in count_names.keys():
      count_names[name] += 1
    else:
      count_names[name] = 1

    def MakeFinalName(name_ = name, index = count_names[name] - 1):
      if (type.lower() == 'end' and
          base in end_names.keys() and len(end_names[base])):
        return end_names[base].pop(-1)  # For correct nesting
      if count_names[name_] != 1:
        name_ = '%s_%s' % (name_, _SUFFIXES[index])
        # We need to use a stack to ensure that the end-tag suffixes match
        # the begin-tag suffixes.  Only needed when more than one tag of the
        # same type.
        if type == 'begin':
          end_name = ('END_%s_%s' % (base, _SUFFIXES[index])).upper()
          if base in end_names.keys():
            end_names[base].append(end_name)
          else:
            end_names[base] = [end_name]

      return name_

    return MakeFinalName

  current = 0
  last_nobreak = False

  while current < len(html):
    m = _MESSAGE_NO_BREAK_COMMENT.match(html[current:])
    if m:
      last_nobreak = True
      current += m.end()
      continue

    m = _NBSP.match(html[current:])
    if m:
      parts.append((MakeNameClosure('SPACE'), m.group()))
      current += m.end()
      continue

    m = _REPLACEABLE.match(html[current:])
    if m:
      # Replaceables allow - but placeholders don't, so replace - with _
      ph_name = MakeNameClosure('X_%s_X' % m.group('name').replace('-', '_'))
      parts.append((ph_name, m.group()))
      current += m.end()
      continue

    m = _SPECIAL_ELEMENT.match(html[current:])
    if m:
      if not include_block_tags:
        if last_nobreak:
          last_nobreak = False
        else:
          raise exception.BlockTagInTranslateableChunk(html)
      element_name = 'block'  # for simplification
      # Get the appropriate group name
      for group in m.groupdict().keys():
        if m.groupdict()[group]:
          break
      parts.append((MakeNameClosure(element_name, 'begin'),
                    html[current : current + m.start(group)]))
      parts.append(m.group(group))
      parts.append((MakeNameClosure(element_name, 'end'),
                    html[current + m.end(group) : current + m.end()]))
      current += m.end()
      continue

    m = _ELEMENT.match(html[current:])
    if m:
      element_name = m.group('element').lower()
      if not include_block_tags and not element_name in _INLINE_TAGS:
        if last_nobreak:
          last_nobreak = False
        else:
          raise exception.BlockTagInTranslateableChunk(html[current:])
      if element_name in _HTML_PLACEHOLDER_NAMES:  # use meaningful names
        element_name = _HTML_PLACEHOLDER_NAMES[element_name]

      # Make a name for the placeholder
      type = ''
      if not m.group('empty'):
        if m.group('closing'):
          type = 'end'
        else:
          type = 'begin'
      parts.append((MakeNameClosure(element_name, type), m.group()))
      current += m.end()
      continue

    if len(parts) and isinstance(parts[-1], types.StringTypes):
      parts[-1] += html[current]
    else:
      parts.append(html[current])
    current += 1

  msg_text = ''
  placeholders = []
  for part in parts:
    if isinstance(part, types.TupleType):
      final_name = part[0]()
      original = part[1]
      msg_text += final_name
      placeholders.append(tclib.Placeholder(final_name, original,
                                            '(HTML code)'))
    else:
      msg_text += part

  msg = tclib.Message(text=msg_text, placeholders=placeholders,
                      description=description)
  content = msg.GetContent()
  for ix in range(len(content)):
    if isinstance(content[ix], types.StringTypes):
      content[ix] = util.UnescapeHtml(content[ix], replace_nbsp=False)

  return msg
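

# A rough example of the naming scheme above (illustrative only): converting
# '<b>one <b>two</b></b>' should allocate BEGIN_BOLD_1 and BEGIN_BOLD_2 for
# the two opening tags and, thanks to the end_names stack, END_BOLD_2 and
# END_BOLD_1 for the matching closing tags, so the suffixes pair up even when
# the same tag is nested.  With a single, un-nested tag no numeric suffix is
# added at all.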


class TrHtml(interface.GathererBase):
  '''Represents a document or message in the template format used by
  Total Recall for HTML documents.'''

  def __init__(self, *args, **kwargs):
    super(TrHtml, self).__init__(*args, **kwargs)
    self.have_parsed_ = False
    self.skeleton_ = []  # list of strings and MessageClique objects
    self.fold_whitespace_ = False

  def SetAttributes(self, attrs):
    '''Sets node attributes used by the gatherer.

    This checks the fold_whitespace attribute.

    Args:
      attrs: The mapping of node attributes.
    '''
    self.fold_whitespace_ = ('fold_whitespace' in attrs and
                             attrs['fold_whitespace'] == 'true')

  def GetText(self):
    '''Returns the original text of the HTML document.'''
    return self.text_

  def GetTextualIds(self):
    return [self.extkey]

  def GetCliques(self):
    '''Returns the message cliques for each translateable message in the
    document.'''
    return [x for x in self.skeleton_ if isinstance(x, clique.MessageClique)]

  def Translate(self, lang, pseudo_if_not_available=True,
                skeleton_gatherer=None, fallback_to_english=False):
    '''Returns this document with translateable messages filled with
    the translation for language 'lang'.

    Args:
      lang: 'en'
      pseudo_if_not_available: True

    Return:
      'ID_THIS_SECTION TYPE\n...BEGIN\n  "Translated message"\n......\nEND

    Raises:
      grit.exception.NotReady() if used before Parse() has been successfully
      called.
      grit.exception.NoSuchTranslation() if 'pseudo_if_not_available' is false
      and there is no translation for the requested language.
    '''
    if len(self.skeleton_) == 0:
      raise exception.NotReady()

    # TODO(joi) Implement support for skeleton gatherers here.

    out = []
    for item in self.skeleton_:
      if isinstance(item, types.StringTypes):
        out.append(item)
      else:
        msg = item.MessageForLanguage(lang,
                                      pseudo_if_not_available,
                                      fallback_to_english)
        for content in msg.GetContent():
          if isinstance(content, tclib.Placeholder):
            out.append(content.GetOriginal())
          else:
            # We escape " characters to increase the chance that attributes
            # will be properly escaped.
            out.append(util.EscapeHtml(content, True))

    return ''.join(out)

  def Parse(self):
    if self.have_parsed_:
      return
    self.have_parsed_ = True

    text = self._LoadInputFile()

    # Ignore the BOM character if the document starts with one.
    if text.startswith(u'\ufeff'):
      text = text[1:]

    self.text_ = text

    # Parsing is done in two phases: First, we break the document into
    # translateable and nontranslateable chunks.  Second, we run through each
    # translateable chunk and insert placeholders for any HTML elements,
    # unescape escaped characters, etc.

    # First handle the silly little [!]-prefixed header because it's not
    # handled by our HTML parsers.
    m = _SILLY_HEADER.match(text)
    if m:
      self.skeleton_.append(text[:m.start('title')])
      self.skeleton_.append(self.uberclique.MakeClique(
        tclib.Message(text=text[m.start('title'):m.end('title')])))
      self.skeleton_.append(text[m.end('title') : m.end()])
      text = text[m.end():]

    chunks = HtmlChunks().Parse(text, self.fold_whitespace_)

    for chunk in chunks:
      if chunk[0]:  # Chunk is translateable
        self.skeleton_.append(self.uberclique.MakeClique(
          HtmlToMessage(chunk[1], description=chunk[2])))
      else:
        self.skeleton_.append(chunk[1])

    # Go through the skeleton and change any messages that consist solely of
    # placeholders and whitespace into nontranslateable strings.
    for ix in range(len(self.skeleton_)):
      got_text = False
      if isinstance(self.skeleton_[ix], clique.MessageClique):
        msg = self.skeleton_[ix].GetMessage()
        for item in msg.GetContent():
          if (isinstance(item, types.StringTypes) and
              _NON_WHITESPACE.search(item) and item != '&nbsp;'):
            got_text = True
            break
        if not got_text:
          self.skeleton_[ix] = msg.GetRealContent()

  def SubstituteMessages(self, substituter):
    '''Applies substitutions to all messages in the tree.

    Goes through the skeleton and finds all MessageCliques.

    Args:
      substituter: a grit.util.Substituter object.
    '''
    new_skel = []
    for chunk in self.skeleton_:
      if isinstance(chunk, clique.MessageClique):
        old_message = chunk.GetMessage()
        new_message = substituter.SubstituteMessage(old_message)
        if new_message is not old_message:
          new_skel.append(self.uberclique.MakeClique(new_message))
          continue
      new_skel.append(chunk)
    self.skeleton_ = new_skel
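

# A minimal, runnable sketch of the two phases described above, using only
# what is defined in this module (illustrative; real .grd processing goes
# through TrHtml, an input file and an UberClique):
if __name__ == '__main__':
  _SAMPLE = '<p>Hello <b>[USER]</b>!</p><script>init();</script>'
  for _translateable, _chunk_text, _desc in HtmlChunks().Parse(_SAMPLE, False):
    if _translateable:
      _msg = HtmlToMessage(_chunk_text, description=_desc)
      # GetContent() mixes plain strings with tclib.Placeholder objects; the
      # placeholders stand in for the <b> tags and the [USER] replaceable.
      _phs = [c for c in _msg.GetContent()
              if isinstance(c, tclib.Placeholder)]
      print 'translateable chunk with %d placeholders: %s' % (
          len(_phs), _chunk_text)
    else:
      print 'verbatim chunk: %s' % _chunk_text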