# Authors: John Dennis <jdennis@redhat.com> # # Copyright (C) 2007 Red Hat, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # __all__ = [ 'escape_html', 'unescape_html', 'html_to_text', 'html_document', ] import htmllib import formatter as Formatter import string from types import * import StringIO #------------------------------------------------------------------------------ class TextWriter(Formatter.DumbWriter): def __init__(self, file=None, maxcol=80, indent_width=4): Formatter.DumbWriter.__init__(self, file, maxcol) self.indent_level = 0 self.indent_width = indent_width self._set_indent() def _set_indent(self): self.indent_col = self.indent_level * self.indent_width self.indent = ' ' * self.indent_col def new_margin(self, margin, level): self.indent_level = level self._set_indent() def send_label_data(self, data): data = data + ' ' if len(data) > self.indent_col: self.send_literal_data(data) else: offset = self.indent_col - len(data) self.send_literal_data(' ' * offset + data) def send_flowing_data(self, data): if not data: return atbreak = self.atbreak or data[0] in string.whitespace col = self.col maxcol = self.maxcol write = self.file.write col = self.col if col == 0: write(self.indent) col = self.indent_col for word in data.split(): if atbreak: if col + len(word) >= maxcol: write('\n' + self.indent) col = self.indent_col else: write(' ') col = col + 1 write(word) col = col + len(word) atbreak = 1 self.col = col self.atbreak = data[-1] in string.whitespace class HTMLParserAnchor(htmllib.HTMLParser): def __init__(self, formatter, verbose=0): htmllib.HTMLParser.__init__(self, formatter, verbose) def anchor_bgn(self, href, name, type): self.anchor = href def anchor_end(self): if self.anchor: self.handle_data(' (%s) ' % self.anchor) self.anchor = None #------------------------------------------------------------------------------ def escape_html(s): if s is None: return None s = s.replace("&", "&") # Must be done first! s = s.replace("<", "<") s = s.replace(">", ">") s = s.replace("'", "'") s = s.replace('"', """) return s def unescape_html(s): if s is None: return None if '&' not in s: return s s = s.replace("<", "<") s = s.replace(">", ">") s = s.replace("'", "'") s = s.replace(""", '"') s = s.replace("&", "&") # Must be last return s def html_to_text(html, maxcol=80): try: buffer = StringIO.StringIO() formatter = Formatter.AbstractFormatter(TextWriter(buffer, maxcol)) parser = HTMLParserAnchor(formatter) parser.feed(html) parser.close() text = buffer.getvalue() buffer.close() return text except Exception, e: log_program.error('cannot convert html to text: %s' % e) return None def html_document(*body_components): '''Wrap the body components in a HTML document structure with a valid header. Accepts a variable number of arguments of of which canb be: * string * a sequences of strings (tuple or list). * a callable object taking no parameters and returning a string or sequence of strings. ''' head = '<html>\n <head>\n <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n </head>\n <body>\n' tail = '\n </body>\n</html>' doc = head for body_component in body_components: if type(body_component) is StringTypes: doc += body_component elif type(body_component) in [TupleType, ListType]: for item in body_component: doc += item elif callable(body_component): result = body_component() if type(result) in [TupleType, ListType]: for item in result: doc += item else: doc += result else: doc += body_component doc += tail return doc