source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/keywordAdder.py @ 3817

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/keywordAdder.py@3817
Revision 3817, 4.5 KB checked in by cbyrom, 11 years ago (diff)

Add default logging support + create new version of ingest script, removing
all traces of the eXist DB + improve documentation and output.

Line 
1#!/usr/bin/env python
2'''
3adds Structured Keywords to moles documents
4'''
5try: #python 2.5
6    from xml.etree import ElementTree as etree
7except ImportError:
8    try:
9        # if you've installed it yourself it comes this way
10        import elementtree.ElementTree as etree
11#        import ElementTree as etree
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import ElementTree as etree
15
16try: #python 2.5
17    from xml.etree import cElementTree
18except ImportError:
19    try:
20        # if you've installed it yourself it comes this way
21        import cElementTree
22    except ImportError:
23        # if you've egged it this is the way it comes
24        from ndgUtils.elementtree import cElementTree
25import molesReadWrite as MRW
26import sys
27import os
28
def PrettyPrint(elem,indent='',html=0,space='   '):
    '''
    Lightweight pretty printing of elementTree elements.

    Renders elem and, recursively, all of its children as an indented
    string with one element per line.

    elem   -- element exposing tag, attrib, text and iterable children
    indent -- whitespace placed in front of this element's tags
    html   -- not used by the rendering; accepted for interface
              compatibility and handed on unchanged to nested calls
    space  -- extra indentation added for every nesting level

    Returns the rendered string (without a trailing newline).

    Leading/trailing whitespace of element text is stripped.  Only the
    tail text of the last child is kept (it is written in front of the
    parent's closing tag); tail text of other children is discarded.
    '''
    # Function-scope import so the block does not rely on the module header.
    from xml.sax.saxutils import escape

    def estrip(text):
        ''' Get rid of unwanted whitespace and escape XML special characters '''
        if text is None:
            return ''
        return escape(text.strip())

    def attribValue(value):
        ''' Escape an attribute value for use inside double quotes '''
        # '%s' keeps the original tolerance for non-string attribute values.
        return escape('%s'%(value,),{'"':'&quot;'})

    strAttrib=''.join([' %s="%s"'%(att,attribValue(elem.attrib[att]))
                       for att in elem.attrib])
    parts=['%s<%s%s>%s'%(indent,elem.tag,strAttrib,estrip(elem.text))]
    children=list(elem)
    if children:
        for item in children:
            # Hand html and space on, so a caller-supplied indentation step
            # is honoured at every depth and not just for the first level.
            parts.append('\n'+PrettyPrint(item,indent=indent+space,
                                          html=html,space=space))
        parts.append('\n%s%s</%s>'%(indent,estrip(children[-1].tail),elem.tag))
    else:
        parts.append('</%s>'%(elem.tag))
    return ''.join(parts)
49
def main(indir, outdir, keywords):
    '''
    Add structured keywords to every MOLES XML record found in a directory.

    indir    -- full path of the directory where the MOLES records reside
    outdir   -- full path of the directory the updated records are written to
    keywords -- flat list of triples: keyword, namespace, key; its length
                must be a multiple of three

    Every file in indir whose name contains '.xml' is parsed, gets one
    dgStructuredKeyword child per triple and is written, pretty printed,
    to outdir under the same file name.  Files that cannot be parsed are
    reported and skipped.

    Invalid arguments print a message and raise SystemExit with status 1.

    NOTE: print is always called with a single parenthesised argument so
    the statements behave the same as a Python 2 print statement and as
    the Python 3 print function.
    '''
    # Validate the arguments actually passed in (not sys.argv) so the
    # function also works when it is called from another module.
    if not indir or not outdir or not keywords:
        print("USAGE: keywordAdder indir, outdir, keywords ")
        print(" where indir= full path of directory where MOLES records reside,")
        print("       outdir= full path of where you want the updated records to go.")
        print("       keywords = list triples:- keyword, namespace, key. Must be multiple of three.")
        sys.exit(1)

    if len(keywords) % 3 != 0:
        print("Keywords must be in triples. keyword namespace key.")
        sys.exit(1)

    print("INFO: moles records are in %s" % (indir,))
    print("INFO: moles records output to %s" % (outdir,))
    print("INFO: keywords to add are %s" % (keywords,))

    numfilesproc = 0

    # split the flat keyword list into a list of [keyword, namespace, key]
    keywordList = [list(keywords[start:start + 3])
                   for start in range(0, len(keywords), 3)]

    # moles skeleton for creating new objects
    M = MRW.MolesDoc()

    # this is a fix to the ElementTree namespace problem that namespaces are
    # usually represented as ns0, ns1, ns2 etc.
    etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})

    for filename in os.listdir(indir):
        # NOTE(review): this is a substring match, so names such as
        # 'record.xml.bak' are processed too - confirm whether a strict
        # extension check is wanted.
        if '.xml' not in filename:
            print("WARNING: File %s appears not to be XML. Will not be processed." % (filename,))
            continue

        full_filename = os.path.join(indir, filename)
        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=full_filename).getroot())
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and sys.exit
            # are not mistaken for a parse failure.
            print("WARNING: Cannot parse the XML moles document %s. Will not process" % (full_filename,))
            continue

        for keyword in keywordList:
            strValidTerm = str(keyword[0])
            strParentListID = str(keyword[1])
            strTermID = str(keyword[2])
            print("%s %s %s" % (strValidTerm, strParentListID, strTermID))
            dgVTID = M.dgValidTermID(ParentListID=strParentListID, TermID=strTermID)
            dgSK = M.dgStructuredKeyword(dgValidTerm=strValidTerm, dgValidTermID=dgVTID)
            dgMeta.dgMetadataRecord.addChildElem('dgStructuredKeyword', dgSK)

        # now write out updated document
        moles = PrettyPrint(dgMeta.toXML())
        f = open(os.path.join(outdir, filename), 'w')
        try:
            f.write(moles)
        finally:
            # always release the handle, even when the write fails
            f.close()
        numfilesproc += 1

    print('INFO: keywordAdder.py ran to end. files processed= %s' % (numfilesproc,))
119
120
if __name__=='__main__':
    # Command line: keywordAdder.py indir outdir keyword namespace key [...]
    # Missing positional arguments are passed on as '' so that main() can
    # print its usage text; indexing sys.argv directly would raise
    # IndexError before main() is ever reached.
    args=sys.argv[1:]
    indir,outdir=(args+['',''])[:2]
    keywords=args[2:]
    main(indir,outdir, keywords)
Note: See TracBrowser for help on using the repository browser.