# Authors: John Dennis <jdennis (at] redhat.com>
#
# Copyright (C) 2007 Red Hat, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#

'''Helpers for escaping HTML, rendering HTML as plain text and wrapping
body fragments in a minimal HTML document.'''

__all__ = [
    'escape_html',
    'unescape_html',
    'html_to_text',

    'html_document',
]

import logging
import string
# Star import kept from the original module; nothing below depends on it.
from types import *

try:
    # Both modules are needed for html_to_text(). htmllib only exists on
    # Python 2 and formatter was removed in Python 3.10, so guard the import
    # to keep the rest of this module usable everywhere.
    import htmllib
    import formatter as Formatter
except ImportError:
    htmllib = None
    Formatter = None

try:
    # Prefer the Python 2 StringIO: io.StringIO on Python 2 only accepts
    # unicode and rejects the plain str objects TextWriter writes.
    from StringIO import StringIO
except ImportError:
    from io import StringIO

_log = logging.getLogger(__name__)

#------------------------------------------------------------------------------


if htmllib is not None:

    class TextWriter(Formatter.DumbWriter):
        '''DumbWriter variant that indents wrapped text by margin level.

        Every output line is prefixed with ``indent_level * indent_width``
        spaces; the level is updated through new_margin().
        '''

        def __init__(self, file=None, maxcol=80, indent_width=4):
            Formatter.DumbWriter.__init__(self, file, maxcol)
            self.indent_level = 0
            self.indent_width = indent_width
            self._set_indent()

        def _set_indent(self):
            '''Recompute the indent column and indent string from the level.'''
            self.indent_col = self.indent_level * self.indent_width
            self.indent = ' ' * self.indent_col

        def new_margin(self, margin, level):
            '''Adopt ``level`` as the indent level; ``margin`` is ignored.'''
            self.indent_level = level
            self._set_indent()

        def send_label_data(self, data):
            '''Write a label followed by a space, right-aligned in the indent
            area when it fits there.'''
            data = data + ' '
            if len(data) > self.indent_col:
                self.send_literal_data(data)
            else:
                offset = self.indent_col - len(data)
                self.send_literal_data(' ' * offset + data)

        def send_flowing_data(self, data):
            '''Write whitespace-separated words, wrapping before ``maxcol``
            and starting every line with the current indent.'''
            if not data:
                return
            atbreak = self.atbreak or data[0] in string.whitespace
            col = self.col
            maxcol = self.maxcol
            write = self.file.write
            if col == 0:
                # At the start of a line: emit the indent first.
                write(self.indent)
                col = self.indent_col
            for word in data.split():
                if atbreak:
                    if col + len(word) >= maxcol:
                        # Word would not fit: wrap and re-indent.
                        write('\n' + self.indent)
                        col = self.indent_col
                    else:
                        write(' ')
                        col = col + 1
                write(word)
                col = col + len(word)
                atbreak = True
            self.col = col
            self.atbreak = data[-1] in string.whitespace

    class HTMLParserAnchor(htmllib.HTMLParser):
        '''HTMLParser that appends each anchor's href, in parentheses, after
        the anchor text.'''

        def __init__(self, formatter, verbose=0):
            htmllib.HTMLParser.__init__(self, formatter, verbose)

        def anchor_bgn(self, href, name, type):
            # Remember the target so anchor_end() can emit it.
            self.anchor = href

        def anchor_end(self):
            if self.anchor:
                self.handle_data(' (%s) ' % self.anchor)
                self.anchor = None

#------------------------------------------------------------------------------


def escape_html(s):
    '''Return ``s`` with the characters & < > ' " replaced by HTML entities.

    Returns None when ``s`` is None.
    '''
    if s is None:
        return None
    s = s.replace("&", "&amp;")  # Must be done first!
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    s = s.replace("'", "&apos;")
    s = s.replace('"', "&quot;")
    return s


def unescape_html(s):
    '''Reverse escape_html(): turn the five entities back into characters.

    Returns None when ``s`` is None.
    '''
    if s is None:
        return None
    if '&' not in s:
        # No entity can be present; skip the replacements.
        return s
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&apos;", "'")
    s = s.replace("&quot;", '"')
    s = s.replace("&amp;", "&")  # Must be last
    return s


def html_to_text(html, maxcol=80):
    '''Render ``html`` as plain text wrapped at ``maxcol`` columns.

    Anchor targets are appended in parentheses after the anchor text.
    Returns the text, or None when the conversion fails or the
    htmllib/formatter modules are not available; failures are logged.
    '''
    if htmllib is None:
        _log.error('cannot convert html to text: %s',
                   'htmllib/formatter modules are not available')
        return None
    try:
        out = StringIO()
        try:
            formatter = Formatter.AbstractFormatter(TextWriter(out, maxcol))
            parser = HTMLParserAnchor(formatter)
            parser.feed(html)
            parser.close()
            return out.getvalue()
        finally:
            out.close()
    except Exception as e:
        # Deliberately best-effort: callers get None instead of an exception.
        _log.error('cannot convert html to text: %s', e)
        return None


def html_document(*body_components):
    '''Wrap the body components in a HTML document structure with a valid header.
    Accepts a variable number of arguments, each of which can be:
    * a string
    * a sequence of strings (tuple or list)
    * a callable object taking no parameters and returning a string or
      a sequence of strings.

    Returns the complete document as one string. Raises TypeError when a
    component (or an item of a sequence) is not a string.
    '''
    head = '<html>\n <head>\n <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n </head>\n <body>\n'
    tail = '\n </body>\n</html>'

    parts = [head]
    for body_component in body_components:
        if not isinstance(body_component, (tuple, list)) and callable(body_component):
            body_component = body_component()
        if isinstance(body_component, (tuple, list)):
            parts.extend(body_component)
        else:
            parts.append(body_component)
    parts.append(tail)
    return ''.join(parts)