#
# This code is designed to clean up incoming harvested metadata documents for
# inclusion in the NDG eXist database ...
# BNL, October 13, 2006
#
6 | |
---|
7 | import os |
---|
# Bind ET to the best available ElementTree implementation.  Only ImportError
# is treated as "not available"; any other failure is allowed to propagate.
try:
    # python 2.5 up to 3.8: C accelerated version in the standard library
    import xml.etree.cElementTree as ET
except ImportError:
    try:
        # python 2.4, assumes you have it in the right sort of place ...
        import cElementTree as ET
    except ImportError:
        try:
            # standalone pure python ElementTree package
            import ElementTree as ET
        except ImportError:
            # python 3.9 and later removed xml.etree.cElementTree; the plain
            # module is the supported spelling there.  If even this fails the
            # ImportError is raised to the caller.
            import xml.etree.ElementTree as ET
18 | |
---|
def oaiClean(inputDirectoryName, outputDirectoryName, fileName, dcKeyWords=None):
    '''Clean up one harvested metadata file and write the result out.

    Reads fileName from inputDirectoryName and writes a file of the same
    name into outputDirectoryName, with the following cleaning actions:
      1) namespaces are stripped from elements and attributes (removeNS),
      2) the ndg dcKeyWords are added as <keywords>/<keyword> elements, and
      3) the cleaned document is wrapped as <ndgDoc><doc>...</doc></ndgDoc>.

    Arguments:
      inputDirectoryName  -- directory holding the harvested file
      outputDirectoryName -- directory the cleaned file is written to
      fileName            -- file name, used for both input and output
      dcKeyWords          -- optional iterable of keyword strings; None
                             (the default) means no keywords are added

    Returns None.  Errors from file access or XML parsing are not caught
    here and propagate to the caller.
    '''
    if dcKeyWords is None:
        dcKeyWords = []

    # construct input and output file identifiers (the output keeps the
    # name of the input file)
    fIn = os.path.join(inputDirectoryName, fileName)
    fOut = os.path.join(outputDirectoryName, fileName)

    # Load the file.  It is opened in binary mode so that the XML parser,
    # not the platform's text layer, decides how the bytes are decoded, and
    # try/finally makes sure the handle is closed even if parsing fails.
    source = open(fIn, 'rb')
    try:
        inTree = ET.parse(source)
    finally:
        source.close()

    # create the output element and populate it with keywords
    outTree = ET.Element('ndgDoc')
    kws = ET.SubElement(outTree, 'keywords')
    for keyword in dcKeyWords:
        ET.SubElement(kws, 'keyword').text = keyword
    doc = ET.SubElement(outTree, 'doc')

    # strip the namespaces from the input document and attach the cleaned
    # copy underneath <doc>
    doc.append(removeNS(inTree.getroot()))

    # The output is always written as UTF-8, whatever the input encoding
    # was.  If that ever needs revisiting, see
    # http://effbot.org/zone/celementtree-encoding.htm, which also has links
    # to methods of recognising the encoding ...
    ET.ElementTree(outTree).write(fOut, encoding='UTF-8')
59 | |
---|
def removeNS(e):
    '''Return a copy of element e, and all its descendants, without namespaces.

    ElementTree spells a qualified name as '{namespace-uri}localname'; the
    copy keeps only the local name for every element tag and attribute name.
    Text and tail are copied unchanged.  Namespace-qualified attributes whose
    full name contains 'schemaLocation' are dropped.  If two attributes end
    up with the same local name, the later one wins.

    The input element is not modified.
    '''
    # Keep whatever follows the last '}' so a '}' inside the namespace URI
    # cannot be mistaken for the end of the namespace part.
    stripped = ET.Element(e.tag.rsplit('}', 1)[-1])
    stripped.text = e.text
    stripped.tail = e.tail

    # can be namespace bumf in attributes as well ... especially in the
    # declarations
    for name, value in e.attrib.items():
        if '}' not in name:
            # plain attribute, copy as is
            stripped.attrib[name] = value
        elif 'schemaLocation' not in name:
            stripped.attrib[name.rsplit('}', 1)[-1]] = value

    for child in e:
        stripped.append(removeNS(child))
    return stripped
80 | |
---|
81 | |
---|
if __name__ == "__main__":
    # Command line use:
    #   oaiClean indirectory outdirectory filename [keyword1,keyword2,...]
    import sys
    try:
        indir, outdir, filename = sys.argv[1:4]
    except ValueError:
        # fewer than three arguments: show what was received and the usage,
        # then stop with a non-zero status so calling scripts see the failure
        print(sys.argv[1:])
        print("usage: oaiClean indirectory outdirectory filename keyword1,keyword2,... ")
        sys.exit(1)
    # the optional fourth argument is a comma separated keyword list; it is
    # only honoured when it is the last argument
    if len(sys.argv) == 5:
        keywords = sys.argv[4].split(',')
    else:
        keywords = []
    oaiClean(indir, outdir, filename, keywords)