source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/keywordAdder.py @ 3865

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/keywordAdder.py@3865
Revision 3865, 4.6 KB checked in by cbyrom, 12 years ago (diff)

Fix keywordAdder so that processing is properly escaped if errors occur
during parsing of file.

Line 
1#!/usr/bin/env python
2'''
3adds Structured Keywords to moles documents
4'''
5try: #python 2.5
6    from xml.etree import ElementTree as etree
7except ImportError:
8    try:
9        # if you've installed it yourself it comes this way
10        import elementtree.ElementTree as etree
11#        import ElementTree as etree
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import ElementTree as etree
15
16try: #python 2.5
17    from xml.etree import cElementTree
18except ImportError:
19    try:
20        # if you've installed it yourself it comes this way
21        import cElementTree
22    except ImportError:
23        # if you've egged it this is the way it comes
24        from ndgUtils.elementtree import cElementTree
25import molesReadWrite as MRW
26import sys
27import os
28
def PrettyPrint(elem,indent='',html=0,space='   '):
    '''Lightweight pretty printing of elementTree elements.

    Serialises elem and its descendants with one element per line, each
    nesting level being indented by one further copy of space.

    elem   -- the element to serialise
    indent -- string placed in front of the lines produced for elem itself
    html   -- not used by the formatting; accepted for compatibility and
              handed on unchanged to the recursive calls
    space  -- indentation unit added for every level of nesting

    Returns the markup as a single string.  Text is stripped of leading
    and trailing whitespace.  Only the tail of the last child is kept (it
    is written just before the closing tag of its parent); the tails of
    the other children and of elem itself are not output.  Text, tail and
    attribute values are XML-escaped so that characters such as & and <
    cannot produce malformed markup.
    '''
    # imported locally so that the function carries its own dependency
    from xml.sax.saxutils import escape

    def clean(text):
        ''' Strip unwanted whitespace and escape markup characters '''
        if text is None:
            return ''
        return escape(text.strip())

    attributes = []
    for name in elem.attrib:
        # the value is wrapped in double quotes, so those need escaping too
        value = escape('%s' % elem.attrib[name], {'"': '&quot;'})
        attributes.append(' %s="%s"' % (name, value))
    opening = '%s<%s%s>%s' % (indent, elem.tag, ''.join(attributes), clean(elem.text))

    children = list(elem)
    if not children:
        return opening + '</%s>' % (elem.tag)

    lines = [opening]
    for child in children:
        # pass space (and html) down, otherwise a custom indentation unit
        # would silently revert to the default below the first level
        lines.append(PrettyPrint(child, indent=indent+space, html=html, space=space))
    lines.append('%s%s</%s>' % (indent, clean(children[-1].tail), elem.tag))
    return '\n'.join(lines)
49
def _keywordTriples(keywords):
    '''Group a flat sequence of keywords into [keyword, namespace, key] lists.

    Any trailing items that do not make up a complete triple are ignored.
    '''
    triples = []
    for start in range(0, len(keywords) - 2, 3):
        triples.append(list(keywords[start:start+3]))
    return triples


def main(indir, outdir, keywords):
    '''Add structured keywords to every MOLES record found in a directory.

    indir    -- directory holding the MOLES records
    outdir   -- directory the updated records are written to (same file names)
    keywords -- flat list of triples: keyword, namespace, key.  For each
                triple the keyword becomes the dgValidTerm, the namespace the
                ParentListID and the key the TermID of a new
                dgStructuredKeyword element.

    Prints a usage message and calls sys.exit() when an argument is missing
    or the keywords are not a multiple of three.  Directory entries whose
    name does not contain '.xml' are skipped with a warning.  A record that
    cannot be read, updated or written is reported and skipped so that the
    remaining records are still processed.
    '''
    # the sys.argv test is kept from the original command line handling
    if len(sys.argv) < 2 or not indir or not outdir or not keywords:
        print("USAGE: keywordAdder indir, outdir, keywords ")
        print(" where indir= full path of directory where MOLES records reside,")
        print("       outdir= full path of where you want the updated records to go.")
        print("       keywords = list triples:- keyword, namespace, key. Must be multiple of three.")
        sys.exit()

    if len(keywords) % 3 != 0:
        print("Keywords must be in triples. keyword namespace key.")
        sys.exit()

    print("INFO: moles records are in %s" % (indir,))
    print("INFO: moles records output to %s" % (outdir,))
    print("INFO: keywords to add are %s" % (keywords,))

    numfilesproc = 0
    keywordList = _keywordTriples(keywords)

    # moles skeleton for creating new objects
    M=MRW.MolesDoc()

    # this is a fix to the ElementTree namespace problem that namespaces are
    # usually represented as ns0, ns1, ns2 etc.
    etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})

    for filename in os.listdir(indir):
        if filename.find('.xml') == -1:
            print("WARNING: File %s appears not to be XML. Will not be processed." % (filename,))
            continue

        full_filename = os.path.join(indir, filename)
        dgMeta=MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=full_filename).getroot())
            for keyword in keywordList:
                strValidTerm = str(keyword[0])
                strParentListID = str(keyword[1])
                strTermID = str(keyword[2])
                print("%s %s %s" % (strValidTerm, strParentListID, strTermID))
                dgVTID=M.dgValidTermID(ParentListID=strParentListID, TermID=strTermID)
                dgSK=M.dgStructuredKeyword(dgValidTerm=strValidTerm, dgValidTermID=dgVTID)
                dgMeta.dgMetadataRecord.addChildElem('dgStructuredKeyword', dgSK)

            # now write out updated document
            moles=PrettyPrint(dgMeta.toXML())
            f=open(os.path.join(outdir, filename),'w')
            try:
                f.write(moles)
            finally:
                # make sure the handle is released even if the write fails
                f.close()
            numfilesproc += 1
        except Exception:
            # Exception rather than a bare except, so that Ctrl-C and
            # sys.exit() are not mistaken for a bad record.
            # sys.exc_info() is used so this also works on old interpreters.
            error = sys.exc_info()[1]
            print("WARNING: Cannot parse the XML moles document %s. Will not process" % (full_filename,))
            print("WARNING: underlying error was: %s" % (error,))
            continue

    print('INFO: keywordAdder.py ran to end. files processed= %s' % (numfilesproc,))
119
120
if __name__=='__main__':
    # Command line: keywordAdder.py indir outdir keyword namespace key [...]
    # Missing positional arguments are passed on as empty values instead of
    # indexing sys.argv directly, so that main() can print its usage message
    # rather than the script dying with an IndexError.
    args=sys.argv[1:]
    indir=''
    outdir=''
    if len(args) > 0:
        indir=args[0]
    if len(args) > 1:
        outdir=args[1]
    keywords=args[2:]
    main(indir,outdir, keywords)
Note: See TracBrowser for help on using the repository browser.