source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oaiClean.py @ 1754

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oaiClean.py@2378
Revision 1754, 3.3 KB checked in by selatham, 14 years ago (diff)

made ndgdoc style wrapping optional. Turned off in script

Line 
1#
2# This code designed to clean up incoming harvested metadata documents for inclusion
3# in the NDG eXist database ...
4# BNL, October 13, 2006
5# History:
6#  SEL 23/11/2006  Make wrapping optional - so can just remove namespace
7
8import os
9try:
10    #python 2.5
11    import xml.etree.cElementTree as ET
12except:
13    try:
14    #python 2.4, assumes you have it in the right sort of place ...
15        import cElementTree as ET
16    except:
17        # and give up and raise an error if you haven't even got that ...
18        import ElementTree as ET
19
def oaiClean(inputDirectoryName,outputDirectoryName,fileName,wrapFlag,dcKeyWords=None):
    ''' Clean up one harvested metadata document and write the result out.

    Reads fileName from inputDirectoryName, and writes a cleaned copy under
    the same file name in outputDirectoryName. Cleaning actions:
        1) namespaces are stripped (see removeNS),
        Optionally (only when wrapFlag compares equal to True)
        2) the ndg dcKeyWords (a list of strings) are added, and
        3) the document is wrapped in an instance of <ndgDoc>:
               <ndgDoc><keywords><keyword>..</keyword></keywords><doc>..</doc></ndgDoc>

    dcKeyWords defaults to no keywords. Nothing is returned; the output file
    is always written as UTF-8. '''

    # avoid a shared mutable default argument
    if dcKeyWords is None:
        dcKeyWords=[]

    #construct input and output file identifiers
    fIn=os.path.join(inputDirectoryName,fileName)

    #is there any reason why it has to have a different name?
    fOut=os.path.join(outputDirectoryName,fileName)

    # Load the file by path: the parser then opens it in binary mode (so the
    # XML declaration decides the decoding) and closes it again itself,
    # instead of leaking a handle we opened.
    root=ET.parse(fIn).getroot()

    # fix namespaces ...
    newDoc=removeNS(root)

    # Deliberately an equality test rather than truthiness: callers may hand
    # in a string such as "False", which must not switch wrapping on.
    if wrapFlag==True:
        #create a wrapper output element and populate it with keywords
        outRoot=ET.Element('ndgDoc')
        kws=ET.SubElement(outRoot,'keywords')
        for keyword in dcKeyWords:
            k=ET.SubElement(kws,'keyword')
            k.text=keyword
        doc=ET.SubElement(outRoot,'doc')
        #put the cleaned-up document inside the wrapper
        doc.append(newDoc)
    else:
        # otherwise just put the cleaned-up original in the output
        outRoot=newDoc

    # this is wrong, we need to know what the input encoding is, but it'll do for now.
    # if we do need to muck with it, see http://effbot.org/zone/celementtree-encoding.htm
    # which also has links to methods of recognising the encoding ...
    ET.ElementTree(outRoot).write(fOut,encoding='UTF-8')
64
65
66
def removeNS(e):
    ''' Return a namespace-free copy of element e (and, recursively, its children).

    ElementTree spells a qualified name as "{namespace-uri}local"; the copy
    keeps only the "local" part for the element tag and for every attribute
    name. Text and tail are carried over unchanged. Namespaced attributes
    whose name contains "schemaLocation" are dropped altogether.

    The input element is not modified; a new element tree is returned. '''
    # split the ET namespace definition from the element tag
    tagParts=e.tag.split('}')
    if len(tagParts)!=2: # no namespace
        stripped=ET.Element(tagParts[0])
    else: # namespace exists
        stripped=ET.Element(tagParts[1])
    stripped.text=e.text
    stripped.tail=e.tail
    for name,value in e.items():
        # can be namespace bumf in attributes as well ... especially in the declarations
        nameParts=name.split('}')
        if len(nameParts)==1:
            stripped.attrib[name]=value
        elif 'schemaLocation' not in name:
            stripped.attrib[nameParts[1]]=value
    for child in e:
        stripped.append(removeNS(child))
    return stripped
87
88
if __name__=="__main__":
    # Command-line entry point:
    #   oaiClean indirectory outdirectory filename wrapFlag [keyword1,keyword2,...]
    import sys
    try:
        indir,outdir,filename,wrapFlag=sys.argv[1:5]
    except ValueError:
        # fewer than the four required arguments were supplied
        print(sys.argv[1:])
        print("usage: oaiClean indirectory outdirectory filename wrapFlag keyword1,keyword2,... ")
        print("     : wrapFlag expects a boolean.")
        # signal the usage error to the calling shell
        sys.exit(1)
    if len(sys.argv)==6:
        keywords=sys.argv[5].split(',')
    else:
        keywords=[]
    # argv entries are strings, and oaiClean compares wrapFlag with True, so
    # the raw text could never enable wrapping; turn it into a real boolean.
    wrapFlag=wrapFlag.strip().lower() in ('true','1','yes')
    oaiClean(indir,outdir,filename,wrapFlag,keywords)
Note: See TracBrowser for help on using the repository browser.