source: TI01-discovery/trunk/ingestAutomation/OAIBatch/keywordAdder.py @ 3785

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/keywordAdder.py@3785
Revision 3785, 4.3 KB checked in by selatham, 11 years ago (diff)

Gather required utilities. Improve ElementTree imports in keywordAdder. Put host etc. in arguments for oai_ingest.

RevLine 
[1837]1#!/usr/bin/env python
2# keywordAdder - adds Structured Keywords to moles documents
[3785]3try: #python 2.5
4    from xml.etree import cElementTree
5    from xml.etree import ElementTree as etree
6except ImportError:
7    try:
8        # if you've installed it yourself it comes this way
9        import cElementTree
10        import elementtree.ElementTree as etree
11    except ImportError:
12        # if you've egged it this is the way it comes
13        from ndgUtils.elementtree import cElementTree
14        from ndgUtils.elementtree import ElementTree as etree
15
[1837]16import molesReadWrite as MRW
17import sys
18import os
19
def PrettyPrint(elem, indent='', html=0, space='   '):
    '''Lightweight pretty printing of elementTree elements.

    Renders ``elem`` and all of its descendants as indented markup, one
    element per line, and returns the result as a string.

    elem   -- element to render; it must provide ``tag``, ``attrib``,
              ``text`` and iteration over its children.
    indent -- prefix written before the opening tag of ``elem``.
    html   -- unused; retained so existing callers keep working.
    space  -- indentation added for every additional nesting level.

    Element text is stripped of surrounding whitespace.  Of the child tails,
    only the stripped tail of the last child is emitted, immediately before
    the parent's closing tag.  Text, tails and attribute values are
    XML-escaped so the output stays well formed when the data contains
    characters such as ``&``, ``<`` or ``"``.
    '''
    # Imported here so the block does not depend on the module's import list.
    from xml.sax.saxutils import escape

    def clean(text):
        ''' Strip unwanted whitespace and escape markup characters '''
        if text is None:
            return ''
        return escape(text.strip())

    def render(node, pad):
        ''' Render one element, recursing with the caller's indent unit '''
        attrs = ''.join([
            # '%s' % value keeps the original tolerance for non-string values.
            ' %s="%s"' % (name, escape('%s' % value, {'"': '&quot;'}))
            for name, value in node.attrib.items()
        ])
        opening = '%s<%s%s>%s' % (pad, node.tag, attrs, clean(node.text))
        children = list(node)
        if not children:
            return '%s</%s>' % (opening, node.tag)
        lines = [opening]
        for child in children:
            # ``space`` is closed over, so every depth uses the same unit.
            lines.append(render(child, pad + space))
        lines.append('%s%s</%s>' % (pad, clean(children[-1].tail), node.tag))
        return '\n'.join(lines)

    return render(elem, indent)
[1869]40
def _keyword_triples(keywords):
    '''Group a flat keyword sequence into [keyword, namespace, key] lists.'''
    return [list(keywords[start:start + 3])
            for start in range(0, len(keywords), 3)]


def main(indir, outdir, keywords):
    '''Add structured keywords to every MOLES record in a directory.

    indir    -- directory holding the MOLES XML records to read.
    outdir   -- directory the updated records are written to, using the
                same file names as the input.
    keywords -- flat sequence of keyword, namespace, key values; its length
                must be a multiple of three.

    Progress, warnings and usage text are printed to stdout.  Files whose
    name does not end in ``.xml`` and files that cannot be parsed are
    reported and skipped.

    Raises SystemExit with status 1, after printing a message, when an
    argument is empty or the keywords do not form complete triples.
    '''
    # Validate the arguments actually passed in; consulting sys.argv here
    # would wrongly reject library callers whose own command line is short.
    if not indir or not outdir or not keywords:
        print("USAGE: keywordAdder indir, outdir, keywords ")
        print(" where indir= full path of directory where MOLES records reside,")
        print("       outdir= full path of where you want the updated records to go.")
        print("       keywords = list triples:- keyword, namespace, key. Must be multiple of three.")
        sys.exit(1)

    if len(keywords) % 3 != 0:
        print("Keywords must be in triples. keyword namespace key.")
        sys.exit(1)

    print("INFO: moles records are in %s" % (indir,))
    print("INFO: moles records output to %s" % (outdir,))
    print("INFO: keywords to add are %s" % (keywords,))

    # split the keywords into list of triples
    keywordList = _keyword_triples(keywords)

    # moles skeleton for creating new objects
    M = MRW.MolesDoc()

    # this is a fix to the ElementTree namespace problem that namespaces are
    # usually represented as ns0, ns1, ns2 etc.
    etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink': 'xlink'})

    numfilesproc = 0
    for filename in os.listdir(indir):
        # Test the extension itself so names such as "x.xml.bak" are skipped.
        if not filename.endswith('.xml'):
            print("WARNING: File %s appears not to be XML. Will not be processed." % filename)
            continue

        full_filename = os.path.join(indir, filename)
        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=full_filename).getroot())
        except Exception:
            # Best effort: an unreadable record is reported and skipped, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print("WARNING: Cannot parse the XML moles document %s. Will not process" % full_filename)
            continue

        for keyword, namespace, key in keywordList:
            strValidTerm = str(keyword)
            strParentListID = str(namespace)
            strTermID = str(key)
            print("%s %s %s" % (strValidTerm, strParentListID, strTermID))
            dgVTID = M.dgValidTermID(ParentListID=strParentListID, TermID=strTermID)
            dgSK = M.dgStructuredKeyword(dgValidTerm=strValidTerm, dgValidTermID=dgVTID)
            dgMeta.dgMetadataRecord.addChildElem('dgStructuredKeyword', dgSK)

        # now write out updated document
        molestree = dgMeta.toXML()
        moles = PrettyPrint(molestree)
        f = open(os.path.join(outdir, filename), 'w')
        try:
            f.write(moles)
        finally:
            # Release the handle even when the write fails.
            f.close()
        numfilesproc += 1

    print('INFO: keywordAdder.py ran to end. files processed= %s' % (numfilesproc))
110
111
if __name__ == '__main__':
    # Command line: keywordAdder.py indir outdir keyword namespace key ...
    # Missing positional arguments become empty strings so that main() can
    # print its usage text instead of the script dying with an IndexError.
    cli_args = sys.argv[1:]
    indir = cli_args[0] if len(cli_args) > 0 else ""
    outdir = cli_args[1] if len(cli_args) > 1 else ""
    keywords = cli_args[2:]
    main(indir, outdir, keywords)
Note: See TracBrowser for help on using the repository browser.