source: TI01-discovery/trunk/ingestAutomation/OAIBatch/keywordAdder.py @ 3785

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/keywordAdder.py@3785
Revision 3785, 4.3 KB checked in by selatham, 11 years ago (diff)

gather required utilities. improve elementree imports in keywordAdder. put host etc in arguments for oai_ingest.

Line 
1#!/usr/bin/env python
2# keywordAdder - adds Structured Keywords to moles documents
3try: #python 2.5
4    from xml.etree import cElementTree
5    from xml.etree import ElementTree as etree
6except ImportError:
7    try:
8        # if you've installed it yourself it comes this way
9        import cElementTree
10        import elementtree.ElementTree as etree
11    except ImportError:
12        # if you've egged it this is the way it comes
13        from ndgUtils.elementtree import cElementTree
14        from ndgUtils.elementtree import ElementTree as etree
15
16import molesReadWrite as MRW
17import sys
18import os
19
def PrettyPrint(elem,indent='',html=0,space='   '):
    '''Lightweight pretty printing of elementTree elements.

    Returns elem and all of its descendants serialised as a string, one
    element per line, with each nesting level indented by a further `space`.

    elem   - element to serialise; its tag, attrib, text and child elements
             are read, plus the tail of the last child.
    indent - whitespace placed in front of this element's own lines.
    html   - accepted for backward compatibility; it has no effect.
    space  - indentation added for every nesting level.

    Text, tails and attribute values are XML-escaped so that characters such
    as & and < in the data cannot break the generated markup.
    '''
    # Local import keeps this function self-contained.
    from xml.sax.saxutils import escape

    def estrip(text):
        ''' Drop unwanted surrounding whitespace (None becomes '') and escape markup '''
        if text is None:
            return ''
        return escape(text.strip())

    strAttrib=''
    for att in elem.attrib:
        # '%s' first so non-string attribute values still format as before;
        # the double quote is escaped too because it delimits the value.
        attValue=escape('%s'%(elem.attrib[att],),{'"':'&quot;'})
        strAttrib+=' %s="%s"'%(att,attValue)
    result='%s<%s%s>%s'%(indent,elem.tag,strAttrib,estrip(elem.text))
    if len(elem):
        for item in elem:
            # Hand `space` down explicitly, otherwise deeper levels silently
            # revert to the default indentation step.
            result+='\n'+PrettyPrint(item,indent=indent+space,html=html,space=space)
        # Only the tail of the last child is emitted, just before the end tag.
        result+='\n%s%s</%s>'%(indent,estrip(item.tail),elem.tag)
    else:
        result+='</%s>'%(elem.tag)
    return result
40
def _keywordTriples(keywords):
    '''Group a flat keyword sequence into [keyword, namespace, key] lists.'''
    return [list(keywords[start:start+3]) for start in range(0, len(keywords), 3)]


def main(indir, outdir, keywords):
    '''Add structured keywords to every XML file found in indir.

    indir    - directory that is listed for input files.
    outdir   - directory the updated documents are written to, using the
               same file names.
    keywords - flat sequence of keyword, namespace, key values; its length
               must be a multiple of three.

    Prints a usage message and exits with status 1 when an argument is empty
    or the keywords do not form complete triples. Files that cannot be parsed
    are reported and skipped.
    '''
    # Validate the arguments themselves (not sys.argv) so that callers using
    # this as a library function are judged on what they actually passed.
    if not indir or not outdir or not keywords:
        print("USAGE: keywordAdder indir, outdir, keywords ")
        print(" where indir= full path of directory where MOLES records reside,")
        print("       outdir= full path of where you want the updated records to go.")
        print("       keywords = list triples:- keyword, namespace, key. Must be multiple of three.")
        sys.exit(1)

    if len(keywords) % 3 != 0:
        print("Keywords must be in triples. keyword namespace key.")
        sys.exit(1)

    print("INFO: moles records are in %s" % indir)
    print("INFO: moles records output to %s" % outdir)
    # Wrapped in a 1-tuple so a tuple of keywords is not taken as format args.
    print("INFO: keywords to add are %s" % (keywords,))

    numfilesproc = 0
    keywordList = _keywordTriples(keywords)

    # moles skeleton for creating new objects
    M=MRW.MolesDoc()

    # Work around ElementTree writing namespaces as ns0, ns1, ns2 etc. by
    # registering the prefixes wanted for the moles and xlink namespaces.
    etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})

    for filename in os.listdir(indir):
        if filename.find('.xml') == -1:
            print("WARNING: File %s appears not to be XML. Will not be processed." % filename)
            continue

        full_filename = os.path.join(indir, filename)
        dgMeta=MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=full_filename).getroot())
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and sys.exit()
            # are not swallowed while skipping unreadable documents.
            print("WARNING: Cannot parse the XML moles document %s. Will not process" % full_filename)
            continue

        for keyword in keywordList:
            strValidTerm=str(keyword[0])
            strParentListID=str(keyword[1])
            strTermID=str(keyword[2])
            print("%s %s %s" % (strValidTerm, strParentListID, strTermID))
            dgVTID=M.dgValidTermID(ParentListID=strParentListID, TermID=strTermID)
            dgSK=M.dgStructuredKeyword(dgValidTerm=strValidTerm, dgValidTermID=dgVTID)
            dgMeta.dgMetadataRecord.addChildElem('dgStructuredKeyword', dgSK)

        # now write out updated document
        molestree=dgMeta.toXML()
        moles=PrettyPrint(molestree)
        f=open(os.path.join(outdir, filename),'w')
        try:
            f.write(moles)
        finally:
            # Close the handle even when the write fails.
            f.close()
        numfilesproc += 1

    print('INFO: keywordAdder.py ran to end. files processed= %s' %(numfilesproc))
110
111
if __name__=='__main__':
    # Command line: keywordAdder indir outdir keyword namespace key [...]
    # When indir or outdir is missing, pass empty values so that main()
    # prints its usage message instead of an IndexError being raised here.
    cliArgs=sys.argv[1:]
    if len(cliArgs) >= 2:
        indir=cliArgs[0]
        outdir=cliArgs[1]
        keywords=cliArgs[2:]
    else:
        indir=""
        outdir=""
        keywords=[]
    main(indir,outdir, keywords)
Note: See TracBrowser for help on using the repository browser.