#!/usr/bin/python # Copyright (c) 2011 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. '''This utility cleans up the html files as emitted by doxygen so that they are suitable for publication on a Google documentation site. ''' import optparse import os import re import shutil import string import sys try: from BeautifulSoup import BeautifulSoup, Tag except (ImportError, NotImplementedError): print ("This tool requires the BeautifulSoup package " "(see http://www.crummy.com/software/BeautifulSoup/).\n" "Make sure that the file BeautifulSoup.py is either in this directory " "or is available in your PYTHON_PATH") raise class HTMLFixer(object): '''This class cleans up the html strings as produced by Doxygen ''' def __init__(self, html): self.soup = BeautifulSoup(html) def FixTableHeadings(self): '''Fixes the doxygen table headings. This includes: - Using bare <h2> title row instead of row embedded in <tr><td> in table - Putting the "name" attribute into the "id" attribute of the <tr> tag. - Splitting up tables into multiple separate tables if a table heading appears in the middle of a table. For example, this html: <table> <tr><td colspan="2"><h2><a name="pub-attribs"></a> Data Fields List</h2></td></tr> ... </table> would be converted to this: <h2>Data Fields List</h2> <table> ... </table> ''' table_headers = [] for tag in self.soup.findAll('tr'): if tag.td and tag.td.h2 and tag.td.h2.a and tag.td.h2.a['name']: #tag['id'] = tag.td.h2.a['name'] tag.string = tag.td.h2.a.next tag.name = 'h2' table_headers.append(tag) # reverse the list so that earlier tags don't delete later tags table_headers.reverse() # Split up tables that have multiple table header (th) rows for tag in table_headers: print "Header tag: %s is %s" % (tag.name, tag.string.strip()) # Is this a heading in the middle of a table? if tag.findPreviousSibling('tr') and tag.parent.name == 'table': print "Splitting Table named %s" % tag.string.strip() table = tag.parent table_parent = table.parent table_index = table_parent.contents.index(table) new_table = Tag(self.soup, name='table', attrs=table.attrs) table_parent.insert(table_index + 1, new_table) tag_index = table.contents.index(tag) for index, row in enumerate(table.contents[tag_index:]): new_table.insert(index, row) # Now move the <h2> tag to be in front of the <table> tag assert tag.parent.name == 'table' table = tag.parent table_parent = table.parent table_index = table_parent.contents.index(table) table_parent.insert(table_index, tag) def RemoveTopHeadings(self): '''Removes <div> sections with a header, tabs, or navpath class attribute''' header_tags = self.soup.findAll( name='div', attrs={'class' : re.compile('^(header|tabs[0-9]*|navpath)$')}) [tag.extract() for tag in header_tags] def FixAll(self): self.FixTableHeadings() self.RemoveTopHeadings() def __str__(self): return str(self.soup) def main(): '''Main entry for the doxy_cleanup utility doxy_cleanup takes a list of html files and modifies them in place.''' parser = optparse.OptionParser(usage='Usage: %prog [options] files...') parser.add_option('-m', '--move', dest='move', action='store_true', default=False, help='move html files to "original_html"') options, files = parser.parse_args() if not files: parser.print_usage() return 1 for filename in files: try: with open(filename, 'r') as file: html = file.read() print "Processing %s" % filename fixer = HTMLFixer(html) fixer.FixAll() with open(filename, 'w') as file: file.write(str(fixer)) if options.move: new_directory = os.path.join( os.path.dirname(os.path.dirname(filename)), 'original_html') if not os.path.exists(new_directory): os.mkdir(new_directory) shutil.move(filename, new_directory) except: print "Error while processing %s" % filename raise return 0 if __name__ == '__main__': sys.exit(main())