# Authors: John Dennis <jdennis@redhat.com>
#
# Copyright (C) 2007 Red Hat, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#

"""HTML helpers: entity escaping, HTML-to-text rendering, document wrapping.

This module is written for Python 2: it depends on the ``htmllib``,
``formatter`` and ``StringIO`` standard-library modules and on
``types.StringTypes``.
"""

__all__ = [
    'escape_html',
    'unescape_html',
    'html_to_text',

    'html_document',
]

import htmllib
import formatter as Formatter
import logging
import string
from types import *
import StringIO

# Logger used by html_to_text() to report conversion failures.  The name
# ``log_program`` is what the error path has always referenced; it was never
# bound in this module, so it is defined here to keep that path from raising
# NameError.
log_program = logging.getLogger(__name__)

#------------------------------------------------------------------------------


class TextWriter(Formatter.DumbWriter):
    """A ``DumbWriter`` that honours the formatter's margin level.

    Each margin level indents flowing text by ``indent_width`` columns, and
    wrapped lines are re-indented to the current level.
    """

    def __init__(self, file=None, maxcol=80, indent_width=4):
        """Create the writer.

        file         -- file-like object receiving the output (passed to
                        DumbWriter).
        maxcol       -- column at which flowing text is wrapped.
        indent_width -- number of spaces added per margin level.
        """
        Formatter.DumbWriter.__init__(self, file, maxcol)
        self.indent_level = 0
        self.indent_width = indent_width
        self._set_indent()

    def _set_indent(self):
        """Recompute the indent column and indent string from the level."""
        self.indent_col = self.indent_level * self.indent_width
        self.indent = ' ' * self.indent_col

    def new_margin(self, margin, level):
        """Record the new margin nesting level; ``margin`` itself is unused."""
        self.indent_level = level
        self._set_indent()

    def send_label_data(self, data):
        """Emit a label followed by a space, right-aligned to the indent column.

        A label wider than the current indent is written as-is; a shorter
        one is left-padded so that it ends exactly at the indent column.
        """
        data = data + ' '
        if len(data) > self.indent_col:
            self.send_literal_data(data)
        else:
            offset = self.indent_col - len(data)
            self.send_literal_data(' ' * offset + data)

    def send_flowing_data(self, data):
        """Write ``data`` word by word, wrapping at ``maxcol`` with indentation."""
        if not data:
            return
        # A separator is due before the first word if the previous chunk
        # ended in whitespace or this chunk starts with whitespace.
        atbreak = self.atbreak or data[0] in string.whitespace
        maxcol = self.maxcol
        write = self.file.write
        col = self.col
        if col == 0:
            # At the start of a line: emit the indentation first.
            write(self.indent)
            col = self.indent_col
        for word in data.split():
            if atbreak:
                if col + len(word) >= maxcol:
                    # Word does not fit: wrap and re-indent.
                    write('\n' + self.indent)
                    col = self.indent_col
                else:
                    write(' ')
                    col = col + 1
            write(word)
            col = col + len(word)
            atbreak = 1
        self.col = col
        self.atbreak = data[-1] in string.whitespace


class HTMLParserAnchor(htmllib.HTMLParser):
    """HTML parser that appends each anchor's href, in parentheses, after its text."""

    def __init__(self, formatter, verbose=0):
        htmllib.HTMLParser.__init__(self, formatter, verbose)

    def anchor_bgn(self, href, name, type):
        """Remember the href of the anchor being opened."""
        self.anchor = href

    def anchor_end(self):
        """Emit ' (href) ' for the anchor being closed, if it had an href."""
        if self.anchor:
            self.handle_data(' (%s) ' % self.anchor)
            self.anchor = None

#------------------------------------------------------------------------------


def escape_html(s):
    """Return ``s`` with the characters & < > ' " replaced by HTML entities.

    Returns None when ``s`` is None.
    """
    if s is None:
        return None
    s = s.replace("&", "&amp;")     # Must be done first!
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    s = s.replace("'", "&apos;")
    s = s.replace('"', "&quot;")
    return s


def unescape_html(s):
    """Inverse of escape_html(): turn the five entities back into characters.

    Returns None when ``s`` is None.
    """
    if s is None:
        return None
    if '&' not in s:
        # No entity can be present; skip the replacements.
        return s
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&apos;", "'")
    s = s.replace("&quot;", '"')
    s = s.replace("&amp;", "&")     # Must be last
    return s


def html_to_text(html, maxcol=80):
    """Render an HTML string as plain text wrapped at ``maxcol`` columns.

    Returns the rendered text, or None if the conversion raised an
    exception (the error is logged through ``log_program``).
    """
    try:
        out = StringIO.StringIO()
        try:
            formatter = Formatter.AbstractFormatter(TextWriter(out, maxcol))
            parser = HTMLParserAnchor(formatter)
            parser.feed(html)
            parser.close()
            return out.getvalue()
        finally:
            out.close()
    except Exception as e:
        # Best effort by design: callers get None rather than an exception.
        log_program.error('cannot convert html to text: %s', e)
        return None


def html_document(*body_components):
    '''Wrap the body components in a HTML document structure with a valid header.
    Accepts a variable number of arguments, each of which can be:
    * a string
    * a sequence of strings (tuple or list).
    * a callable object taking no parameters and returning a string or sequence of strings.
    '''
    head = '<html>\n <head>\n <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n </head>\n <body>\n'
    tail = '\n </body>\n</html>'

    parts = [head]

    for body_component in body_components:
        if isinstance(body_component, StringTypes):
            parts.append(body_component)
        elif isinstance(body_component, (tuple, list)):
            parts.extend(body_component)
        elif callable(body_component):
            result = body_component()
            if isinstance(result, (tuple, list)):
                parts.extend(result)
            else:
                parts.append(result)
        else:
            # Anything else is treated like a string; a non-string raises
            # TypeError when the document is joined below.
            parts.append(body_component)

    parts.append(tail)
    return ''.join(parts)