source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oaiClean.py @ 1590

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oaiClean.py@1590
Revision 1590, 3.0 KB checked in by selatham, 14 years ago (diff)

Created a Python version of the discovery record cleaning code. It also wraps the record in ndgDoc and keyword tags.

Line 
1#
2# This code is designed to clean up incoming harvested metadata documents for inclusion
3# in the NDG eXist database ...
4# BNL, October 13, 2006
5#
6
7import os
8try:
9    #python 2.5
10    import xml.etree.cElementTree as ET
11except:
12    try:
13    #python 2.4, assumes you have it in the right sort of place ...
14        import cElementTree as ET
15    except:
16        # and give up and raise an error if you haven't even got that ...
17        import ElementTree as ET
18
def oaiClean(inputDirectoryName,outputDirectoryName,fileName,dcKeyWords=None):

    ''' Clean up one harvested metadata file and write the result out again.

    Reads the file fileName in directory inputDirectoryName and writes the cleaned
    document, under the same file name, in outputDirectoryName. The cleaning actions are:
        1) we strip namespaces (via removeNS),
        2) we add the ndg dcKeyWords, one <keyword> element per list entry, inside
           a <keywords> element, and
        3) wrap it all up in an instance of <ndgDoc>

    dcKeyWords is an optional list of keyword strings; None (the default) means
    no keywords, in which case an empty <keywords> element is still written.

    Nothing is returned. Errors raised while opening, parsing or writing are not
    caught here and propagate to the caller. '''

    # None rather than [] as the default, so no list object is shared between calls
    if dcKeyWords is None:
        dcKeyWords=[]

    #construct input and output file identifiers
    fIn=os.path.join(inputDirectoryName,fileName)

    #is there any reason why it has to have a different name?
    fOut=os.path.join(outputDirectoryName,fileName)

    #load the file ... opened in binary so the parser sees the raw bytes, and closed
    #explicitly so the handle is not left dangling, even if the parse fails
    source=open(fIn,'rb')
    try:
        inTree=ET.parse(source)
    finally:
        source.close()

    #create the output element and populate it with keywords
    outTree=ET.Element('ndgDoc')
    kws=ET.SubElement(outTree,'keywords')
    for keyword in dcKeyWords:
        k=ET.SubElement(kws,'keyword')
        k.text=keyword

    #get the root element of the input and clean it up ...
    root=inTree.getroot()

    # fix namespaces ...
    newDoc=removeNS(root)

    #put it in the output, after the keywords
    outTree.append(newDoc)

    # this is wrong, we need to know what the input encoding is, but it'll do for now.
    # if we do need to muck with it, see http://effbot.org/zone/celementtree-encoding.htm
    # which also has links to methods of recognising the encoding ...
    output=ET.ElementTree(outTree)
    output.write(fOut,encoding='UTF-8')
58   
def removeNS(e):
    ''' Return a namespace-free deep copy of element e.

    ElementTree spells a qualified name as '{namespace-uri}local'; the copy keeps only
    the local part of every element tag and attribute name. Text and tail are carried
    over unchanged and the children are copied recursively, in document order.
    Namespaced attributes whose qualified name contains 'schemaLocation' are dropped.
    e itself is not modified.

    Note that attributes which differ only by namespace end up with the same name in
    the copy, in which case the one visited last wins. '''
    def localName(qname):
        # everything after the last '}' ... a name with no namespace comes back as it is
        return qname.rsplit('}',1)[-1]

    y=ET.Element(localName(e.tag))
    y.text=e.text
    y.tail=e.tail
    for a in e.keys():
        # can be namespace bumf in attributes as well ... especially in the declarations
        if '}' not in a:
            # plain attribute, copy it across untouched
            y.attrib[a]=e.attrib[a]
        elif a.find('schemaLocation')==-1:
            # namespaced attribute: keep it under its local name, unless it is a
            # schemaLocation style declaration, which is thrown away
            y.attrib[localName(a)]=e.attrib[a]
    for child in e:
        y.append(removeNS(child))
    return y
79   
80
# Command line entry point:
#   oaiClean indirectory outdirectory filename [keyword1,keyword2,...]
if __name__=="__main__":
    import sys
    args=sys.argv[1:]
    # the three positional arguments are mandatory; say so and exit with a failure
    # status (rather than swallowing every possible error in a bare except)
    if len(args)<3:
        print(args)
        print("usage: oaiClean indirectory outdirectory filename keyword1,keyword2,... ")
        sys.exit(1)
    indir,outdir,filename=args[:3]
    # keywords are only picked up when exactly one extra, comma separated, argument is given
    if len(args)==4:
        keywords=args[3].split(',')
    else:
        keywords=[]
    oaiClean(indir,outdir,filename,keywords)
Note: See TracBrowser for help on using the repository browser.