source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/keywordAdder.py @ 3909

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/keywordAdder.py@3909
Revision 3909, 4.5 KB checked in by cbyrom, 11 years ago (diff)

Add default logger use to keywordAdder + tidy up code.

Line 
1#!/usr/bin/env python
2'''
3adds Structured Keywords to moles documents
4'''
5try: #python 2.5
6    from xml.etree import ElementTree as etree
7except ImportError:
8    try:
9        # if you've installed it yourself it comes this way
10        import elementtree.ElementTree as etree
11#        import ElementTree as etree
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import ElementTree as etree
15
16try: #python 2.5
17    from xml.etree import cElementTree
18except ImportError:
19    try:
20        # if you've installed it yourself it comes this way
21        import cElementTree
22    except ImportError:
23        # if you've egged it this is the way it comes
24        from ndgUtils.elementtree import cElementTree
25import molesReadWrite as MRW
26import sys, os, logging
27
28
def PrettyPrint(elem, indent='', html=0, space='   '):
    '''Lightweight pretty printing of elementTree elements.

    Renders elem and all of its descendants as text, one element per line,
    with every nesting level indented by one further copy of space.

    elem   -- element to render; must expose tag, attrib, text and
              iteration over its children
    indent -- prefix written before elem's opening (and closing) tag
    html   -- accepted and handed on to nested calls; it has no effect on
              the output produced here
    space  -- indentation unit added for each nesting level

    Returns the rendered markup as a string.

    Leading and trailing whitespace of element text is stripped.  Only the
    tail of the last child is emitted (just before the parent's closing
    tag); tails of earlier children are dropped.  Tag names, attribute
    values and text are written as-is, with no XML escaping.
    '''
    def estrip(text):
        ''' Just want to get rid of unwanted whitespace '''
        if text is None:
            return ''
        return text.strip()

    strAttrib = ''.join([' %s="%s"' % (att, elem.attrib[att])
                         for att in elem.attrib])
    result = '%s<%s%s>%s' % (indent, elem.tag, strAttrib, estrip(elem.text))

    children = list(elem)
    if not children:
        return result + '</%s>' % (elem.tag)

    for item in children:
        # Pass html and space down explicitly: otherwise every level below
        # the first silently reverts to the default settings.
        result += '\n' + PrettyPrint(item, indent=indent + space,
                                     html=html, space=space)
    last_tail = estrip(children[-1].tail)
    result += '\n%s%s</%s>' % (indent, last_tail, elem.tag)
    return result
49
def main(indir, outdir, keywords):
    '''Add structured keywords to every MOLES record found in a directory.

    indir    -- directory holding the MOLES XML records to read
    outdir   -- directory the updated records are written to, under the
                same file names
    keywords -- flat list of keyword, namespace, key values; its length
                must be a multiple of three

    Prints a usage/error message and exits with status 1 (SystemExit) when
    indir, outdir or keywords is empty, or when keywords does not consist
    of complete triples.  A file that cannot be read, updated or written is
    logged and skipped; directory entries whose name does not contain
    ".xml" are logged and skipped as well.
    '''
    # Validate the arguments themselves rather than sys.argv, so the function
    # behaves the same whether it is run from the command line or imported.
    if not indir or not outdir or not keywords:
        print("USAGE: keywordAdder indir, outdir, keywords ")
        print(" where indir= full path of directory where MOLES records reside,")
        print("       outdir= full path of where you want the updated records to go.")
        print("       keywords = list triples:- keyword, namespace, key. Must be multiple of three.")
        sys.exit(1)

    if len(keywords) % 3 != 0:
        print("ERROR: Keywords must be in triples: keyword namespace key.")
        print("Actual:  %s" % (keywords,))
        sys.exit(1)

    logging.info("moles records are in %s", indir)
    logging.info("moles records output to %s", outdir)
    logging.info("keywords to add are %s", keywords)

    numfilesproc = 0

    # split the flat keyword list into [keyword, namespace, key] triples
    keywordList = [list(keywords[start:start + 3])
                   for start in range(0, len(keywords), 3)]

    # moles skeleton for creating new objects
    M = MRW.MolesDoc()

    # this is a fix to the ElementTree namespace problem that namespaces are
    # usually represented as ns0, ns1, ns2 etc.
    etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles',
                                 'http://www.w3.org/1999/xlink': 'xlink'})

    for filename in os.listdir(indir):
        # NOTE(review): this matches ".xml" anywhere in the name, so e.g.
        # "record.xml.bak" is processed too - confirm whether that is intended.
        if filename.find('.xml') == -1:
            logging.error("File %s appears not to be XML. Will not be processed.", filename)
            continue

        full_filename = os.path.join(indir, filename)
        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=full_filename).getroot())
            for keyword in keywordList:
                strValidTerm = str(keyword[0])
                strParentListID = str(keyword[1])
                strTermID = str(keyword[2])
                dgVTID = M.dgValidTermID(ParentListID=strParentListID, TermID=strTermID)
                dgSK = M.dgStructuredKeyword(dgValidTerm=strValidTerm, dgValidTermID=dgVTID)
                dgMeta.dgMetadataRecord.addChildElem('dgStructuredKeyword', dgSK)

            # now write out updated document
            moles = PrettyPrint(dgMeta.toXML())
            f = open(os.path.join(outdir, filename), 'w')
            try:
                f.write(moles)
            finally:
                # close the handle even when the write itself fails
                f.close()
            numfilesproc += 1
        except Exception:
            # Deliberately broad: one bad record must not stop the batch.
            # sys.exc_info() is used because it works on every Python version.
            detail = sys.exc_info()[1]
            logging.error("Cannot parse the XML moles document %s. Will not process", full_filename)
            logging.error("Detail: %s", detail)
            continue

    logging.info('keywordAdder.py ran to end. files processed= %s', numfilesproc)
117
118
if __name__ == '__main__':
    # Command line: keywordAdder.py indir outdir keyword namespace key [...]
    # With fewer than two positional arguments, fall back to empty values so
    # that main() prints its usage text instead of this block raising
    # IndexError while indexing sys.argv.
    indir, outdir, keywords = "", "", []
    if len(sys.argv) >= 3:
        indir = sys.argv[1]
        outdir = sys.argv[2]
        keywords = sys.argv[3:]
    main(indir, outdir, keywords)
Note: See TracBrowser for help on using the repository browser.