1 | # |
---|
2 | # This code is designed to clean up incoming harvested metadata documents for inclusion |
---|
3 | # in the NDG eXist database ... |
---|
4 | # BNL, October 13, 2006 |
---|
5 | # History: |
---|
6 | # SEL 23/11/2006 Make wrapping optional - so can just remove namespace |
---|
7 | |
---|
8 | import os |
---|
9 | try: |
---|
10 | #python 2.5 |
---|
11 | import xml.etree.cElementTree as ET |
---|
12 | except: |
---|
13 | try: |
---|
14 | #python 2.4, assumes you have it in the right sort of place ... |
---|
15 | import cElementTree as ET |
---|
16 | except: |
---|
17 | # and give up and raise an error if you haven't even got that ... |
---|
18 | import ElementTree as ET |
---|
19 | |
---|
def oaiClean(inputDirectoryName,outputDirectoryName,fileName,wrapFlag,dcKeyWords=None):
    ''' Clean up one harvested metadata file and write the result out.

    The file fileName is read from inputDirectoryName, every element and
    attribute has its namespace stripped (see removeNS), and the result is
    written under the same file name in outputDirectoryName as UTF-8.

    Arguments:
      inputDirectoryName  -- directory holding the file to clean
      outputDirectoryName -- directory the cleaned file is written to
      fileName            -- file name, used for both input and output
      wrapFlag            -- whether to wrap the cleaned document.  May be a
                             real boolean, or text such as "true", "1" or
                             "yes" (any case) as it arrives from a command
                             line; any other text means "do not wrap".
                             Other non-text values are judged by truthiness.
      dcKeyWords          -- optional list of keyword strings; only used
                             when wrapping

    When wrapping is requested the output has the shape
      <ndgDoc>
        <keywords><keyword>...</keyword>...</keywords>
        <doc> ...cleaned document... </doc>
      </ndgDoc>
    otherwise the cleaned document itself is the output root.

    Returns nothing; errors from parsing or file access propagate. '''

    # None stands in for "no keywords" so no list is shared between calls
    if dcKeyWords is None:
        dcKeyWords=[]

    #construct input and output file identifiers (same name, different directory)
    fIn=os.path.join(inputDirectoryName,fileName)
    fOut=os.path.join(outputDirectoryName,fileName)

    # Give the parser the path rather than a file object opened here: nothing
    # is left open by this function, and the parser reads the raw bytes so
    # the encoding named in the XML declaration is what gets used.
    root=ET.parse(fIn).getroot()

    # fix namespaces ...
    newDoc=removeNS(root)

    # A flag that came from the command line is text, and text never compares
    # equal to True, so recognise the usual spellings explicitly.
    lower=getattr(wrapFlag,'lower',None)
    if lower is not None:
        wrap=lower().strip() in ('true','1','yes','y','t','on')
    else:
        wrap=bool(wrapFlag)

    if wrap:
        # build the wrapper, fill in the keywords, then hang the document off <doc>
        outRoot=ET.Element('ndgDoc')
        kws=ET.SubElement(outRoot,'keywords')
        for keyword in dcKeyWords:
            ET.SubElement(kws,'keyword').text=keyword
        ET.SubElement(outRoot,'doc').append(newDoc)
    else:
        # otherwise the cleaned-up original is the whole output
        outRoot=newDoc

    # The output is always declared and written as UTF-8, whatever the input
    # declared.  If that ever needs revisiting, see
    # http://effbot.org/zone/celementtree-encoding.htm
    ET.ElementTree(outRoot).write(fOut,encoding='UTF-8')
---|
64 | |
---|
65 | |
---|
66 | |
---|
def removeNS(e):
    ''' Return a copy of element e, and everything below it, without namespaces.

    ElementTree spells a qualified name as '{uri}local'; the copy keeps only
    the 'local' part of every element tag and attribute name.  Text and tail
    are carried over unchanged.  Namespaced attributes whose qualified name
    contains 'schemaLocation' are dropped rather than renamed.  If two
    attributes end up with the same local name, the one visited last wins.

    The input element is not modified. '''
    # Everything after the last '}' is the local name; a name without a
    # namespace has no '}' and comes back whole.
    y=ET.Element(e.tag.rsplit('}',1)[-1])
    y.text=e.text
    y.tail=e.tail
    for name,value in e.items():
        if '}' not in name:
            # plain attribute, copy as is
            y.attrib[name]=value
        elif 'schemaLocation' not in name:
            # namespaced attribute: keep it under its local name, except for
            # schema location hints, which are discarded
            y.attrib[name.rsplit('}',1)[-1]]=value
    for child in e:
        y.append(removeNS(child))
    return y
---|
87 | |
---|
88 | |
---|
if __name__=="__main__":
    # Command line entry point:
    #   oaiClean indirectory outdirectory filename wrapFlag [keyword1,keyword2,...]
    import sys

    args=sys.argv[1:]
    if len(args)<4:
        # too few arguments: show what was received, explain, and fail
        print(args)
        print("usage: oaiClean indirectory outdirectory filename wrapFlag keyword1,keyword2,... ")
        print(" : wrapFlag expects a boolean.")
        sys.exit(1)

    indir,outdir,filename,wrapFlag=args[:4]

    # keywords are an optional fifth argument, comma separated
    if len(args)==5:
        keywords=args[4].split(',')
    else:
        keywords=[]

    # argv only ever holds text, so turn the flag into a real boolean before
    # handing it on; anything but these spellings means "do not wrap"
    wrapFlag=wrapFlag.strip().lower() in ('true','1','yes','y','t','on')

    oaiClean(indir,outdir,filename,wrapFlag,keywords)
---|