1 # Authors: John Dennis <jdennis (at] redhat.com> 2 # 3 # Copyright (C) 2007 Red Hat, Inc. 4 # 5 # This program is free software; you can redistribute it and/or modify 6 # it under the terms of the GNU General Public License as published by 7 # the Free Software Foundation; either version 2 of the License, or 8 # (at your option) any later version. 9 # 10 # This program is distributed in the hope that it will be useful, 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # GNU General Public License for more details. 14 # 15 # You should have received a copy of the GNU General Public License 16 # along with this program; if not, write to the Free Software 17 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 # 19 20 21 __all__ = [ 22 'escape_html', 23 'unescape_html', 24 'html_to_text', 25 26 'html_document', 27 ] 28 29 import htmllib 30 import formatter as Formatter 31 import string 32 from types import * 33 import StringIO 34 35 #------------------------------------------------------------------------------ 36 37 38 class TextWriter(Formatter.DumbWriter): 39 40 def __init__(self, file=None, maxcol=80, indent_width=4): 41 Formatter.DumbWriter.__init__(self, file, maxcol) 42 self.indent_level = 0 43 self.indent_width = indent_width 44 self._set_indent() 45 46 def _set_indent(self): 47 self.indent_col = self.indent_level * self.indent_width 48 self.indent = ' ' * self.indent_col 49 50 def new_margin(self, margin, level): 51 self.indent_level = level 52 self._set_indent() 53 54 def send_label_data(self, data): 55 data = data + ' ' 56 if len(data) > self.indent_col: 57 self.send_literal_data(data) 58 else: 59 offset = self.indent_col - len(data) 60 self.send_literal_data(' ' * offset + data) 61 62 def send_flowing_data(self, data): 63 if not data: 64 return 65 atbreak = self.atbreak or data[0] in string.whitespace 66 col = self.col 67 maxcol = self.maxcol 68 write = self.file.write 69 col = self.col 70 if col == 0: 71 write(self.indent) 72 col = self.indent_col 73 for word in data.split(): 74 if atbreak: 75 if col + len(word) >= maxcol: 76 write('\n' + self.indent) 77 col = self.indent_col 78 else: 79 write(' ') 80 col = col + 1 81 write(word) 82 col = col + len(word) 83 atbreak = 1 84 self.col = col 85 self.atbreak = data[-1] in string.whitespace 86 87 88 class HTMLParserAnchor(htmllib.HTMLParser): 89 90 def __init__(self, formatter, verbose=0): 91 htmllib.HTMLParser.__init__(self, formatter, verbose) 92 93 def anchor_bgn(self, href, name, type): 94 self.anchor = href 95 96 def anchor_end(self): 97 if self.anchor: 98 self.handle_data(' (%s) ' % self.anchor) 99 self.anchor = None 100 101 #------------------------------------------------------------------------------ 102 103 104 def escape_html(s): 105 if s is None: 106 return None 107 s = s.replace("&", "&") # Must be done first! 108 s = s.replace("<", "<") 109 s = s.replace(">", ">") 110 s = s.replace("'", "'") 111 s = s.replace('"', """) 112 return s 113 114 115 def unescape_html(s): 116 if s is None: 117 return None 118 if '&' not in s: 119 return s 120 s = s.replace("<", "<") 121 s = s.replace(">", ">") 122 s = s.replace("'", "'") 123 s = s.replace(""", '"') 124 s = s.replace("&", "&") # Must be last 125 return s 126 127 128 def html_to_text(html, maxcol=80): 129 try: 130 buffer = StringIO.StringIO() 131 formatter = Formatter.AbstractFormatter(TextWriter(buffer, maxcol)) 132 parser = HTMLParserAnchor(formatter) 133 parser.feed(html) 134 parser.close() 135 text = buffer.getvalue() 136 buffer.close() 137 return text 138 except Exception, e: 139 log_program.error('cannot convert html to text: %s' % e) 140 return None 141 142 143 def html_document(*body_components): 144 '''Wrap the body components in a HTML document structure with a valid header. 145 Accepts a variable number of arguments of of which canb be: 146 * string 147 * a sequences of strings (tuple or list). 148 * a callable object taking no parameters and returning a string or sequence of strings. 149 ''' 150 head = '<html>\n <head>\n <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n </head>\n <body>\n' 151 tail = '\n </body>\n</html>' 152 153 doc = head 154 155 for body_component in body_components: 156 if type(body_component) is StringTypes: 157 doc += body_component 158 elif type(body_component) in [TupleType, ListType]: 159 for item in body_component: 160 doc += item 161 elif callable(body_component): 162 result = body_component() 163 if type(result) in [TupleType, ListType]: 164 for item in result: 165 doc += item 166 else: 167 doc += result 168 else: 169 doc += body_component 170 171 doc += tail 172 return doc 173