source: exist/trunk/python/elementtree-1.3a6-20070220-badc/elementtree/TidyTools.py @ 3578

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/elementtree-1.3a6-20070220-badc/elementtree/TidyTools.py@3578
Revision 3578, 3.2 KB checked in by pjkersha, 11 years ago (diff)

Latest releases from Fredrik Lundh. 10 March release has exclusive C14N support with namespace prefixes.

Line 
#
# ElementTree
# $Id: TidyTools.py 3375 2008-02-13 08:05:08Z fredrik $
#
# tools to run the "tidy" command on an HTML or XHTML file, and return
# the contents as an XHTML element tree.
#
# history:
# 2002-10-19 fl   added to ElementTree library; added getzonebody function
#
# Copyright (c) 1999-2008 by Fredrik Lundh.  All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
16
##
# Tools to build element trees from HTML, using the external <b>tidy</b>
# utility.
##
21
import glob
import os
import string
import subprocess
import sys

from ElementTree import ElementTree, Element
25
# XHTML namespace in ElementTree "{uri}" notation; tags of elements in
# the XHTML namespace start with this prefix.
NS_XHTML = "{http://www.w3.org/1999/xhtml}"
27
##
# Convert an HTML or HTML-like file to XHTML, using the <b>tidy</b>
# command line utility.
# <p>
# The converted document is written to <i>file</i>.out, and the
# diagnostics from tidy to <i>file</i>.err.  Both files are removed
# if the output could be parsed as XML; otherwise they are left in
# place for inspection.
#
# @param file Filename.
# @param new_inline_tags An optional list of valid but non-standard
#     inline tags.
# @return An element tree, or None if not successful.

def tidy(file, new_inline_tags=None):

    command = ["tidy", "-qn", "-asxml"]

    if new_inline_tags:
        command.append("--new-inline-tags")
        command.append(",".join(new_inline_tags))

    # FIXME: support more tidy options!

    command.append(file)

    out_name = file + ".out"
    err_name = file + ".err"

    # convert.  the command is passed as an argument list and run
    # without a shell, so spaces and shell metacharacters in the file
    # name cannot split or alter the command line.
    out = open(out_name, "w")
    try:
        err = open(err_name, "w")
        try:
            try:
                subprocess.call(command, stdout=out, stderr=err)
            except OSError:
                # tidy could not be started (e.g. it is not installed).
                # record the reason where the caller is told to look;
                # the empty .out file makes the XML check below fail.
                err.write("*** cannot run tidy: %s\n" % (sys.exc_info()[1],))
        finally:
            err.close()
    finally:
        out.close()

    # check that the result is valid XML
    try:
        tree = ElementTree()
        tree.parse(out_name)
    except Exception:
        # deliberately broad: any parse or read problem means "not
        # successful", but interpreter exits and Ctrl-C still propagate
        print("*** %s:%s" % sys.exc_info()[:2])
        print("*** %s is not valid XML "
              "(check %s.err for info)" % (file, file))
        tree = None
    else:
        # success; the intermediate files are no longer needed
        for name in (out_name, err_name):
            if os.path.isfile(name):
                os.remove(name)

    return tree
67
##
# Get document body from an HTML or HTML-like file.  This function
# uses the <b>tidy</b> function to convert HTML to XHTML, and cleans
# up the resulting XML tree by stripping the XHTML namespace from the
# element tags.
#
# @param file Filename.
# @param **options Keyword options, passed on to the <b>tidy</b>
#     function.
# @return A <b>body</b> element, or None if not successful.

def getbody(file, **options):
    # get clean body from text file

    # get xhtml tree
    try:
        tree = tidy(file, **options)
    except IOError:
        print("*** %s" % (sys.exc_info()[1],))
        return None
    if tree is None:
        return None

    # remove namespace uris.  newer ElementTree versions provide
    # iter() and deprecate (or no longer have) getiterator(), so use
    # iter() when the tree offers it.
    walk = getattr(tree, "iter", None) or tree.getiterator
    prefix_length = len(NS_XHTML)
    for node in walk():
        if node.tag.startswith(NS_XHTML):
            node.tag = node.tag[prefix_length:]

    # find returns None if the document has no body element
    return tree.getroot().find("body")
98
##
# Same as <b>getbody</b>, but turns plain text at the start of the
# document into an H1 tag.  This function can be used to parse zone
# documents.
# <p>
# The leading text of the body is always cleared, also when it only
# contains whitespace and no H1 element is created.
#
# @param file Filename.
# @param **options Keyword options, passed on to the <b>getbody</b>
#     function.
# @return A <b>body</b> element, or None if not successful.

def getzonebody(file, **options):

    body = getbody(file, **options)
    if body is None:
        return None

    # strip once; a missing (None) text counts as empty
    heading = (body.text or "").strip()
    if heading:
        title = Element("h1")
        title.text = heading
        title.tail = "\n\n"
        body.insert(0, title)

    body.text = None

    return body
122
if __name__ == "__main__":

    # run tidy on every file matching the patterns given on the
    # command line, and report the result for each file
    for pattern in sys.argv[1:]:
        for filename in glob.glob(pattern):
            # write the file name before converting, so that any
            # diagnostics printed by tidy follow the name of the file
            # they belong to
            sys.stdout.write("%s ... " % filename)
            sys.stdout.write("%s\n" % (tidy(filename),))
Note: See TracBrowser for help on using the repository browser.