source: TI01-discovery/trunk/ingestAutomation/OAIBatch/keywordAdder.py @ 2088

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/keywordAdder.py@2378
Revision 2088, 3.9 KB checked in by selatham, 14 years ago (diff)

implementing keywordAdder.

Line 
1#!/usr/bin/env python
2# keywordAdder - adds Structured Keywords to moles documents
3import cElementTree
4import elementtree.ElementTree as etree
5import molesReadWrite as MRW
6import sys
7import os
8
def PrettyPrint(elem,indent='',html=0,space='   '):
    '''Lightweight pretty printing of elementTree elements.

    Renders elem and its descendants as text, one element per line, with
    every nesting level indented by one further copy of space.

    elem   -- element to render; must provide .tag, .attrib, .text, .tail
              and iteration over its child elements
    indent -- prefix written in front of this element's opening tag
    html   -- accepted for interface compatibility and passed down to the
              children; it has no effect on the output
    space  -- indentation added for each level of nesting

    Returns the rendered string (no trailing newline).

    NOTE: text and attribute values are written verbatim (no XML
    escaping), and of all the child tails only the last child's tail is
    emitted, just before the closing tag.
    '''
    def estrip(text):
        ''' Just want to get rid of unwanted whitespace '''
        if text is None:
            return ''
        return text.strip()

    attribs = ''.join([' %s="%s"' % (name, elem.attrib[name])
                       for name in elem.attrib])
    pieces = ['%s<%s%s>%s' % (indent, elem.tag, attribs, estrip(elem.text))]

    last_child = None
    for child in elem:
        # Forward html and space so that a caller-supplied indentation unit
        # applies at every depth, not only to the first level of children.
        pieces.append('\n' + PrettyPrint(child, indent=indent+space,
                                         html=html, space=space))
        last_child = child

    if last_child is not None:
        # Closing tag goes on its own line, preceded by the last child's tail.
        pieces.append('\n%s%s</%s>' % (indent, estrip(last_child.tail), elem.tag))
    else:
        pieces.append('</%s>' % (elem.tag))
    return ''.join(pieces)
29
def _keyword_triples(keywords):
    '''Group a flat keyword sequence into [keyword, namespace, key] lists.'''
    return [list(keywords[start:start+3])
            for start in range(0, len(keywords), 3)]


def main(indir, outdir, keywords):
    '''Add structured keywords to every MOLES record found in indir.

    indir    -- directory holding the MOLES XML records to read
    outdir   -- directory the updated records are written to, under the
                same file names
    keywords -- flat list of triples: keyword, namespace, key. Its length
                must be a multiple of three.

    Prints a usage message and calls sys.exit() when indir or outdir is
    empty or no keywords are given, and likewise when the keywords do not
    form complete triples. The checks look only at the parameters, never
    at sys.argv, so the function behaves the same when called from other
    code.

    For each directory entry whose name contains '.xml' the record is
    parsed, one dgStructuredKeyword child per triple is added to its
    dgMetadataRecord, and the pretty-printed result is written to outdir.
    Files that cannot be parsed, and entries that do not look like XML,
    are reported with a WARNING and skipped.
    '''
    if indir == "" or outdir == "" or not keywords:
        print("USAGE: keywordAdder indir, outdir, keywords ")
        print(" where indir= full path of directory where MOLES records reside,")
        print("       outdir= full path of where you want the updated records to go.")
        print("       keywords = list triples:- keyword, namespace, key. Must be multiple of three.")
        sys.exit()

    if len(keywords) % 3 != 0:
        print("Keywords must be in triples. keyword namespace key.")
        sys.exit()

    print("INFO: moles records are in %s" % (indir,))
    print("INFO: moles records output to %s" % (outdir,))
    print("INFO: keywords to add are %s" % (keywords,))

    numfilesproc = 0

    # split the keywords into list of triples
    keywordList = _keyword_triples(keywords)

    # moles skeleton for creating new objects
    M = MRW.MolesDoc()

    # this is a fix to the ElementTree namespace problem that namespaces are
    # usually represented as ns0, ns1, ns2 etc.
    etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})

    for filename in os.listdir(indir):
        if filename.find('.xml') == -1:
            print("WARNING: File %s appears not to be XML. Will not be processed." % (filename,))
            continue

        full_filename = indir + "/" + filename
        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=full_filename).getroot())
        except Exception:
            # Skip unreadable/invalid documents, but let SystemExit and
            # other non-Exception interrupts propagate.
            print("WARNING: Cannot parse the XML moles document %s. Will not process" % (full_filename,))
            continue

        for keyword in keywordList:
            strValidTerm = str(keyword[0])
            strParentListID = str(keyword[1])
            strTermID = str(keyword[2])
            print("%s %s %s" % (strValidTerm, strParentListID, strTermID))
            dgVTID = M.dgValidTermID(ParentListID=strParentListID, TermID=strTermID)
            dgSK = M.dgStructuredKeyword(dgValidTerm=strValidTerm, dgValidTermID=dgVTID)
            dgMeta.dgMetadataRecord.addChildElem('dgStructuredKeyword', dgSK)

        # now write out updated document
        moles = PrettyPrint(dgMeta.toXML())
        f = open(outdir + "/" + filename, 'w')
        try:
            f.write(moles)
        finally:
            # always release the handle, even if the write fails
            f.close()
        numfilesproc += 1

    print('INFO: keywordAdder.py ran to end. files processed= %s' % (numfilesproc))
99
100
if __name__=='__main__':
    # Command line: keywordAdder.py indir outdir keyword namespace key [...]
    if len(sys.argv) < 3:
        # Too few arguments to index indir/outdir safely: pass empty values
        # so that main() prints its usage message instead of the script
        # dying with an IndexError.
        main("", "", [])
    else:
        main(sys.argv[1], sys.argv[2], sys.argv[3:])
Note: See TracBrowser for help on using the repository browser.