source: TI01-discovery/trunk/ingestAutomation/OAIBatch/keywordAdder.py @ 2066

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/keywordAdder.py@2066
Revision 2066, 3.3 KB checked in by selatham, 13 years ago (diff)

continued development.

Line 
1#!/usr/bin/env python
2# keywordAdder - adds Structured Keywords to moles documents
3import cElementTree
4import elementtree.ElementTree as etree
5import molesReadWrite as MRW
6import sys
7import os
8
def PrettyPrint(elem,indent='',html=0,space='   '):
    '''Lightweight pretty printing of elementTree elements.

    Serialises elem and all of its descendants to a string, one element
    per line, each nesting level indented by one further copy of space.

    elem   - ElementTree element to serialise.
    indent - whitespace prefix placed before this element's own lines.
    html   - not used by the serialisation; accepted for backward
             compatibility and forwarded unchanged to nested calls.
    space  - indentation unit added for every nesting level.

    Returns the markup as a string without a trailing newline.  Leading
    and trailing whitespace of element text is stripped.  Only the tail
    of the last child is emitted (just before the parent's closing tag);
    tails of the other children are dropped.  Text, tail and attribute
    values are XML-escaped so that characters such as & and < cannot
    produce malformed output.
    '''
    # Imported locally so the block does not depend on the module header.
    from xml.sax.saxutils import escape

    def estrip(text):
        ''' Strip unwanted whitespace and escape XML special characters '''
        if text is None:
            return ''
        return escape(text.strip())

    strAttrib=''
    for att in elem.attrib:
        # '%s' formatting keeps the tolerance for non-string attribute values;
        # the double quote must be escaped because it delimits the value.
        value=escape('%s'%elem.attrib[att],{'"':'&quot;'})
        strAttrib+=' %s="%s"'%(att,value)
    result='%s<%s%s>%s'%(indent,elem.tag,strAttrib,estrip(elem.text))
    if len(elem):
        for item in elem:
            # Forward html and space so a custom indentation unit applies at
            # every depth, not just the first level.
            result+='\n'+PrettyPrint(item,indent=indent+space,html=html,space=space)
        # After the loop, item is the last child of elem.
        result+='\n%s%s</%s>'%(indent,estrip(item.tail),elem.tag)
    else:
        result+='</%s>'%(elem.tag)
    return result
29
def main(indir, outdir, keywords):
    '''Add structured keywords to every MOLES XML record found in indir.

    indir    - full path of the directory holding the MOLES records.
    outdir   - full path of the directory the updated records are written
               to, using the same file names as in indir.
    keywords - nested list of [[keyword, namespace, key],...]; every entry
               is added to every record as a dgStructuredKeyword.

    Files whose name does not contain '.xml', and files that cannot be
    parsed, are reported with a WARNING and skipped.

    Prints the usage text and exits with status 1 (SystemExit) when any
    argument is empty.  Raises IndexError when a keyword entry has fewer
    than three items; this happens before any file is touched.
    '''
    # Validate the arguments themselves (not sys.argv, which is unrelated
    # when this function is called from other code) and really exit.
    if not indir or not outdir or not keywords:
        print("USAGE: keywordAdder(indir, outdir, keywords) ")
        print(" where indir= full path of directory where MOLES records reside,")
        print("       outdir= full path of where you want the updated records to go.")
        print("       keywords = nested list of [[keyword, namespace, key],...] which need to be added")
        sys.exit(1)

    print("INFO: moles records are in %s" %indir)
    print("INFO: moles records output to %s" %outdir)
    # Wrap in a 1-tuple so a tuple-valued keywords argument is formatted
    # as a single value instead of being spread over the format string.
    print("INFO: keywords to add are %s" %(keywords,))

    # Normalise all keyword entries up front so a malformed entry fails
    # before any record has been rewritten.
    keywordTriples = []
    for keyword in keywords:
        keywordTriples.append((str(keyword[0]), str(keyword[1]), str(keyword[2])))

    # initialise variables
    numfilesproc = 0

    # moles skeleton for creating new objects
    M=MRW.MolesDoc()

    #this is a fix to the  ElementTree namespace problem that namespaces are usually represented as ns0, ns1, ns2 etc.
    etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})

    for filename in os.listdir(indir):
        if filename.find('.xml') == -1:
            print("WARNING: File %s appears not to be XML. Will not be processed." %filename)
            continue

        full_filename = os.path.join(indir, filename)
        dgMeta=MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=full_filename).getroot())
        except Exception:
            # Best effort: skip unreadable records, but do not swallow
            # KeyboardInterrupt/SystemExit the way a bare except would.
            print("WARNING: Cannot parse the XML moles document %s. Will not process" %full_filename)
            continue

        # Add every requested keyword, not just one fixed list entry.
        for strValidTerm, strParentListID, strTermID in keywordTriples:
            print("%s %s %s" %(strValidTerm, strParentListID, strTermID))
            dgVTID=M.dgValidTermID(ParentListID=strParentListID, TermID=strTermID)
            dgSK=M.dgStructuredKeyword(dgValidTerm=strValidTerm, dgValidTermID=dgVTID)
            dgMeta.dgMetadataRecord.addChildElem('dgStructuredKeyword', dgSK)

        # now write out updated document
        print(dir(dgMeta.dgMetadataRecord))   # debugging aid kept from the original
        molestree=dgMeta.toXML()
        moles=PrettyPrint(molestree)
        f=open(os.path.join(outdir, filename),'w')
        try:
            f.write(moles)
        finally:
            # Close the handle even when the write fails.
            f.close()
        numfilesproc += 1

    print('INFO: keywordAdder.py ran to end. files processed= %s' %(numfilesproc))
82
83
if __name__=='__main__':
    # Command line form:
    #   keywordAdder.py indir outdir keyword namespace key [keyword namespace key ...]
    # Everything after outdir is grouped into [keyword, namespace, key]
    # triples, the nested-list shape that main() indexes into.
    args=sys.argv[3:]
    if len(sys.argv) < 6 or len(args) % 3 != 0:
        # Report bad invocations instead of failing with an IndexError.
        print("USAGE: keywordAdder.py indir outdir keyword namespace key [keyword namespace key ...]")
        sys.exit(1)
    indir=sys.argv[1]
    outdir=sys.argv[2]
    keywords=[args[i:i+3] for i in range(0, len(args), 3)]
    main(indir,outdir, keywords)
Note: See TracBrowser for help on using the repository browser.