source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oaiClean.py @ 1599

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oaiClean.py@1599
Revision 1599, 3.0 KB checked in by selatham, 15 years ago (diff)

oaiClean.py included <doc> element around ingested discovery document.

Line 
#
# This code is designed to clean up incoming harvested metadata documents for inclusion
# in the NDG eXist database ...
# BNL, October 13, 2006
#
6
7import os
8try:
9    #python 2.5
10    import xml.etree.cElementTree as ET
11except:
12    try:
13    #python 2.4, assumes you have it in the right sort of place ...
14        import cElementTree as ET
15    except:
16        # and give up and raise an error if you haven't even got that ...
17        import ElementTree as ET
18
def oaiClean(inputDirectoryName,outputDirectoryName,fileName,dcKeyWords=None):
    ''' Clean up one harvested metadata file and write the result out.

    Reads fileName from inputDirectoryName and writes a file of the same name
    into outputDirectoryName, with the following cleaning actions carried out:
        1) namespaces are stripped from element and attribute names (removeNS),
        2) the ndg keywords in dcKeyWords (an iterable of strings; None or an
           empty list means no keywords) are added, one <keyword> element each,
           under a <keywords> element, and
        3) the cleaned document is placed inside a <doc> element; <keywords>
           and <doc> are both children of an <ndgDoc> root element.

    Nothing is returned. No exceptions are caught here, so any error raised
    while opening, parsing or writing propagates to the caller. '''

    # None stands in for "no keywords" so that no list object is shared between calls
    if dcKeyWords is None:
        dcKeyWords=[]

    #construct input and output file identifiers
    fIn=os.path.join(inputDirectoryName,fileName)

    #is there any reason why it has to have a different name?
    fOut=os.path.join(outputDirectoryName,fileName)

    # load the file: binary mode hands the parser the raw bytes, and the
    # try/finally makes sure the handle is closed even if parsing fails
    source=open(fIn,'rb')
    try:
        inTree=ET.parse(source)
    finally:
        source.close()

    #create the output element and populate it with keywords
    outTree=ET.Element('ndgDoc')
    kws=ET.SubElement(outTree,'keywords')
    for keyword in dcKeyWords:
        k=ET.SubElement(kws,'keyword')
        k.text=keyword
    doc=ET.SubElement(outTree,'doc')

    #get the root element of the input and clean it up ...
    root=inTree.getroot()

    # fix namespaces ...
    newDoc=removeNS(root)

    # put it in the output
    doc.append(newDoc)

    # the output is always written as UTF-8, whatever the input encoding was.
    # if we do need to muck with it, see http://effbot.org/zone/celementtree-encoding.htm
    # which also has links to methods of recognising the encoding ...
    output=ET.ElementTree(outTree)
    output.write(fOut,encoding='UTF-8')
59   
def removeNS(e):
    ''' Return a new element that is a namespace-free copy of element e.

    ElementTree spells a qualified name as '{uri}local'; this function keeps
    only the 'local' part, for the element tag and for attribute names alike.
    Text and tail are copied unchanged and children are converted recursively.

    Attribute handling:
        - attributes without a namespace are copied as they are;
        - namespaced attributes whose qualified name contains 'schemaLocation'
          are dropped;
        - every other namespaced attribute is copied under its local name, so
          two attributes sharing a local name in different namespaces collapse
          into one (whichever is processed last wins).

    The input element e is not modified. '''
    t=e.tag.split('}') # gets us the ET namespace definition split from the element tag
    if len(t)!=2: # no namespace
        y=ET.Element(t[0])
    else: # namespace exists
        y=ET.Element(t[1])
    y.text=e.text
    y.tail=e.tail
    for a in e.keys():
        # can be namespace bumf in attributes as well ... especially in the declarations
        aa=a.split('}')
        if len(aa)==1:
            y.attrib[a]=e.attrib[a]
        elif a.find('schemaLocation')==-1:
            y.attrib[aa[1]]=e.attrib[a]
    for child in e:
        y.append(removeNS(child))
    return y
80   
81
if __name__=="__main__":
    # command line use:
    #     oaiClean indirectory outdirectory filename [keyword1,keyword2,...]
    import sys
    try:
        indir,outdir,filename=sys.argv[1:4]
    except ValueError:
        # fewer than three arguments were given, so the unpacking failed:
        # show what was received plus the usage line, and exit with a failure status
        sys.stderr.write("%s\n"%(sys.argv[1:],))
        sys.stderr.write("usage: oaiClean indirectory outdirectory filename keyword1,keyword2,... \n")
        sys.exit(1)
    # the optional fourth argument is a comma separated list of keywords
    if len(sys.argv)==5:
        keywords=sys.argv[4].split(',')
    else:
        keywords=[]
    oaiClean(indir,outdir,filename,keywords)
Note: See TracBrowser for help on using the repository browser.