#!/usr/bin/env python
"""usage: %prog [options] filename
Parse a document to a tree, with optional profiling
"""
import sys
import os
import traceback
from optparse import OptionParser
from html5lib import html5parser, sanitizer
from html5lib.tokenizer import HTMLTokenizer
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants
from html5lib import utils
def parse():
optParser = getOptParser()
opts,args = optParser.parse_args()
encoding = "utf8"
try:
f = args[-1]
# Try opening from the internet
if f.startswith('http://'):
try:
import urllib.request, urllib.parse, urllib.error, cgi
f = urllib.request.urlopen(f)
contentType = f.headers.get('content-type')
if contentType:
(mediaType, params) = cgi.parse_header(contentType)
encoding = params.get('charset')
except:
pass
elif f == '-':
f = sys.stdin
if sys.version_info[0] >= 3:
encoding = None
else:
try:
# Try opening from file system
f = open(f, "rb")
except IOError as e:
sys.stderr.write("Unable to open file: %s\n" % e)
sys.exit(1)
except IndexError:
sys.stderr.write("No filename provided. Use -h for help\n")
sys.exit(1)
treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)
if opts.sanitize:
tokenizer = sanitizer.HTMLSanitizer
else:
tokenizer = HTMLTokenizer
p = html5parser.HTMLParser(tree=treebuilder, tokenizer=tokenizer, debug=opts.log)
if opts.fragment:
parseMethod = p.parseFragment
else:
parseMethod = p.parse
if opts.profile:
import cProfile
import pstats
cProfile.runctx("run(parseMethod, f, encoding)", None,
{"run": run,
"parseMethod": parseMethod,
"f": f,
"encoding": encoding},
"stats.prof")
# XXX - We should use a temp file here
stats = pstats.Stats('stats.prof')
stats.strip_dirs()
stats.sort_stats('time')
stats.print_stats()
elif opts.time:
import time
t0 = time.time()
document = run(parseMethod, f, encoding)
t1 = time.time()
if document:
printOutput(p, document, opts)
t2 = time.time()
sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
else:
sys.stderr.write("\n\nRun took: %fs"%(t1-t0))
else:
document = run(parseMethod, f, encoding)
if document:
printOutput(p, document, opts)
def run(parseMethod, f, encoding):
try:
document = parseMethod(f, encoding=encoding)
except:
document = None
traceback.print_exc()
return document
def printOutput(parser, document, opts):
if opts.encoding:
print("Encoding:", parser.tokenizer.stream.charEncoding)
for item in parser.log:
print(item)
if document is not None:
if opts.xml:
tb = opts.treebuilder.lower()
if tb == "dom":
document.writexml(sys.stdout, encoding="utf-8")
elif tb == "lxml":
import lxml.etree
sys.stdout.write(lxml.etree.tostring(document))
elif tb == "etree":
sys.stdout.write(utils.default_etree.tostring(document))
elif opts.tree:
if not hasattr(document,'__getitem__'):
document = [document]
for fragment in document:
print(parser.tree.testSerializer(fragment))
elif opts.hilite:
sys.stdout.write(document.hilite("utf-8"))
elif opts.html:
kwargs = {}
for opt in serializer.HTMLSerializer.options:
try:
kwargs[opt] = getattr(opts,opt)
except:
pass
if not kwargs['quote_char']:
del kwargs['quote_char']
tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
if sys.version_info[0] >= 3:
encoding = None
else:
encoding = "utf-8"
for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
sys.stdout.write(text)
if not text.endswith('\n'): sys.stdout.write('\n')
if opts.error:
errList=[]
for pos, errorcode, datavars in parser.errors:
errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
def getOptParser():
parser = OptionParser(usage=__doc__)
parser.add_option("-p", "--profile", action="store_true", default=False,
dest="profile", help="Use the hotshot profiler to "
"produce a detailed log of the run")
parser.add_option("-t", "--time",
action="store_true", default=False, dest="time",
help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")
parser.add_option("-b", "--treebuilder", action="store", type="string",
dest="treebuilder", default="etree")
parser.add_option("-e", "--error", action="store_true", default=False,
dest="error", help="Print a list of parse errors")
parser.add_option("-f", "--fragment", action="store_true", default=False,
dest="fragment", help="Parse as a fragment")
parser.add_option("", "--tree", action="store_true", default=False,
dest="tree", help="Output as debug tree")
parser.add_option("-x", "--xml", action="store_true", default=False,
dest="xml", help="Output as xml")
parser.add_option("", "--no-html", action="store_false", default=True,
dest="html", help="Don't output html")
parser.add_option("", "--hilite", action="store_true", default=False,
dest="hilite", help="Output as formatted highlighted code.")
parser.add_option("-c", "--encoding", action="store_true", default=False,
dest="encoding", help="Print character encoding used")
parser.add_option("", "--inject-meta-charset", action="store_true",
default=False, dest="inject_meta_charset",
help="inject <meta charset>")
parser.add_option("", "--strip-whitespace", action="store_true",
default=False, dest="strip_whitespace",
help="strip whitespace")
parser.add_option("", "--omit-optional-tags", action="store_true",
default=False, dest="omit_optional_tags",
help="omit optional tags")
parser.add_option("", "--quote-attr-values", action="store_true",
default=False, dest="quote_attr_values",
help="quote attribute values")
parser.add_option("", "--use-best-quote-char", action="store_true",
default=False, dest="use_best_quote_char",
help="use best quote character")
parser.add_option("", "--quote-char", action="store",
default=None, dest="quote_char",
help="quote character")
parser.add_option("", "--no-minimize-boolean-attributes",
action="store_false", default=True,
dest="minimize_boolean_attributes",
help="minimize boolean attributes")
parser.add_option("", "--use-trailing-solidus", action="store_true",
default=False, dest="use_trailing_solidus",
help="use trailing solidus")
parser.add_option("", "--space-before-trailing-solidus",
action="store_true", default=False,
dest="space_before_trailing_solidus",
help="add space before trailing solidus")
parser.add_option("", "--escape-lt-in-attrs", action="store_true",
default=False, dest="escape_lt_in_attrs",
help="escape less than signs in attribute values")
parser.add_option("", "--escape-rcdata", action="store_true",
default=False, dest="escape_rcdata",
help="escape rcdata element values")
parser.add_option("", "--sanitize", action="store_true", default=False,
dest="sanitize", help="sanitize")
parser.add_option("-l", "--log", action="store_true", default=False,
dest="log", help="log state transitions")
return parser
if __name__ == "__main__":
parse()