# Authors: John Dennis <jdennis@redhat.com>
#
# Copyright (C) 2007 Red Hat, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
__all__ = [
'escape_html',
'unescape_html',
'html_to_text',
'html_document',
]
import htmllib
import formatter as Formatter
import string
from types import *
import StringIO
#------------------------------------------------------------------------------
class TextWriter(Formatter.DumbWriter):
    '''Plain-text writer that adds a left margin to DumbWriter output.

    The margin is ``indent_level * indent_width`` spaces.  It is written
    at the start of a line by send_flowing_data() and is used as the
    right-alignment width for labels in send_label_data().
    '''

    def __init__(self, file=None, maxcol=80, indent_width=4):
        '''Set up the writer with an indentation level of zero.

        file and maxcol are handed unchanged to DumbWriter.__init__;
        indent_width is the number of spaces per indentation level.
        '''
        Formatter.DumbWriter.__init__(self, file, maxcol)
        self.indent_width = indent_width
        self.indent_level = 0
        self._set_indent()

    def _set_indent(self):
        '''Recompute the margin column and margin string from the level.'''
        width = self.indent_level * self.indent_width
        self.indent_col = width
        self.indent = ' ' * width

    def new_margin(self, margin, level):
        '''Adopt level as the new indentation level (margin is not used).'''
        self.indent_level = level
        self._set_indent()

    def send_label_data(self, data):
        '''Emit a label followed by one space, right-aligned in the margin.

        A label wider than the margin is emitted as is, without padding.
        '''
        # rjust() pads on the left up to the margin width and returns the
        # text unchanged when it is already at least that wide.
        self.send_literal_data((data + ' ').rjust(self.indent_col))

    def send_flowing_data(self, data):
        '''Write data word by word, wrapping at maxcol with the margin.

        Runs of whitespace in data collapse to a single space.  When
        output starts at column zero the margin is written first, and
        every wrapped line begins with the margin as well.
        '''
        if not data:
            return
        emit = self.file.write
        limit = self.maxcol
        # A separator is owed if the previous chunk ended on whitespace
        # or this chunk begins with whitespace.
        need_gap = self.atbreak or data[0] in string.whitespace
        column = self.col
        if column == 0:
            emit(self.indent)
            column = self.indent_col
        for token in data.split():
            if need_gap:
                if column + len(token) < limit:
                    emit(' ')
                    column += 1
                else:
                    # The word would reach maxcol: continue on a new line.
                    emit('\n' + self.indent)
                    column = self.indent_col
            emit(token)
            column += len(token)
            need_gap = True
        self.col = column
        self.atbreak = data[-1] in string.whitespace
class HTMLParserAnchor(htmllib.HTMLParser):
def __init__(self, formatter, verbose=0):
htmllib.HTMLParser.__init__(self, formatter, verbose)
def anchor_bgn(self, href, name, type):
self.anchor = href
def anchor_end(self):
if self.anchor:
self.handle_data(' (%s) ' % self.anchor)
self.anchor = None
#------------------------------------------------------------------------------
def escape_html(s):
    '''Return s with the HTML special characters replaced by entities.

    The characters & < > ' " become &amp; &lt; &gt; &apos; &quot;
    respectively.  None is passed through unchanged so that callers
    can escape optional values without testing them first.
    '''
    if s is None:
        return None
    # The ampersand must be handled first, otherwise the ampersands
    # introduced by the other replacements would be escaped again.
    s = s.replace("&", "&amp;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    s = s.replace("'", "&apos;")
    s = s.replace('"', "&quot;")
    return s
def unescape_html(s):
    '''Return s with the entities &lt; &gt; &apos; &quot; &amp; decoded.

    Each entity is replaced by the character it stands for
    (< > ' " & respectively).  None is passed through unchanged, and a
    string that contains no ampersand is returned as is.
    '''
    if s is None:
        return None
    if '&' not in s:
        # No ampersand means no entity can be present.
        return s
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&apos;", "'")
    s = s.replace("&quot;", '"')
    # The ampersand must be handled last, otherwise an escaped entity
    # such as "&amp;lt;" would be decoded twice and become "<".
    s = s.replace("&amp;", "&")
    return s
def html_to_text(html, maxcol=80):
    '''Render an HTML string as plain text.

    The markup is parsed with HTMLParserAnchor and written through a
    TextWriter whose maximum column is maxcol.

    Returns the rendered text, or None if any exception is raised
    during the conversion (the failure is reported via log_program).
    '''
    try:
        output = StringIO.StringIO()
        try:
            writer = TextWriter(output, maxcol)
            parser = HTMLParserAnchor(Formatter.AbstractFormatter(writer))
            parser.feed(html)
            parser.close()
            return output.getvalue()
        finally:
            # Release the buffer on the failure path as well.
            output.close()
    except Exception as e:
        # NOTE(review): log_program is not defined or imported in the
        # visible part of this file - confirm it is provided elsewhere.
        log_program.error('cannot convert html to text: %s' % e)
        return None
def html_document(*body_components):
    '''Wrap the body components in an HTML document structure with a valid header.

    Accepts a variable number of arguments, each of which can be:

    * a string
    * a sequence of strings (tuple or list)
    * a callable object taking no parameters and returning a string
      or a sequence of strings (tuple or list)

    The components are concatenated in order between the document head
    and tail, and the complete document is returned as one string.
    '''
    head = '<html>\n <head>\n <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n </head>\n <body>\n'
    tail = '\n </body>\n</html>'

    # Collect the pieces and join once instead of growing a string.
    pieces = [head]
    for body_component in body_components:
        if isinstance(body_component, (tuple, list)):
            pieces.extend(body_component)
        elif callable(body_component):
            result = body_component()
            if isinstance(result, (tuple, list)):
                pieces.extend(result)
            else:
                pieces.append(result)
        else:
            # Plain strings end up here.
            pieces.append(body_component)
    pieces.append(tail)
    return ''.join(pieces)