source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/keywordAdder.py @ 3797

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/keywordAdder.py@3797
Revision 3797, 4.5 KB checked in by cbyrom, 12 years ago (diff)

Upgraded version of ingest codebranch - including major refactoring of the ingest
scripts to make more OO - allowing re-use and simplification of code + removal of reliance
on eXist DB to store data; this will now all be stored and looked up from the Postgres DB

Line 
1#!/usr/bin/env python
2# keywordAdder - adds Structured Keywords to MOLES documents
3try: #python 2.5
4    from xml.etree import ElementTree as etree
5except ImportError:
6    try:
7        # if you've installed it yourself it comes this way
8        import elementtree.ElementTree as etree
9#        import ElementTree as etree
10    except ImportError:
11        # if you've egged it this is the way it comes
12        from ndgUtils.elementtree import ElementTree as etree
13
14try: #python 2.5
15    from xml.etree import cElementTree
16except ImportError:
17    try:
18        # if you've installed it yourself it comes this way
19        import cElementTree
20    except ImportError:
21        # if you've egged it this is the way it comes
22        from ndgUtils.elementtree import cElementTree
23import molesReadWrite as MRW
24import sys
25import os
26
def PrettyPrint(elem,indent='',html=0,space='   '):
    '''Lightweight pretty printing of elementTree elements.

    Renders elem and, recursively, its children as an indented XML string.

    elem   -- element to render; its tag, attrib, text and child elements
              are read, plus the tail of its last child
    indent -- string written in front of this element's opening tag
    html   -- accepted for interface compatibility and handed on to the
              recursive calls; it does not influence the output
    space  -- indentation added for each level of nesting

    Returns the markup as one string with no trailing newline. Text, tail
    and attribute values are whitespace-stripped and XML-escaped. Only the
    tail of the last child is emitted (just before the closing tag); the
    tails of earlier children are dropped.
    '''
    # Local import: keeps this function self-contained without touching the
    # import section at the top of the file.
    from xml.sax.saxutils import escape

    def estrip(text):
        ''' Drop unwanted whitespace and escape XML special characters '''
        if text is None:
            return ''
        return escape(text.strip())

    # Attribute values are always wrapped in double quotes, so a literal
    # double quote inside a value has to be escaped as well.
    strAttrib=''.join([' %s="%s"'%(att,escape(elem.attrib[att],{'"':'&quot;'}))
                       for att in elem.attrib])
    opening='%s<%s%s>%s'%(indent,elem.tag,strAttrib,estrip(elem.text))

    children=list(elem)
    if not children:
        return opening+'</%s>'%(elem.tag)

    lines=[opening]
    for item in children:
        # Hand space (and html) down so every level uses the caller's
        # indentation unit, not only the first one.
        lines.append(PrettyPrint(item,indent=indent+space,html=html,space=space))
    lines.append('%s%s</%s>'%(indent,estrip(children[-1].tail),elem.tag))
    return '\n'.join(lines)
47
def _keyword_triples(keywords):
    '''Group a flat keyword sequence into [keyword, namespace, key] lists.

    Any incomplete trailing group is ignored; main() rejects such input
    before calling this helper.
    '''
    usable = len(keywords) - len(keywords) % 3
    return [list(keywords[start:start + 3]) for start in range(0, usable, 3)]


def main(indir, outdir, keywords):
    '''Add structured keywords to every MOLES record found in a directory.

    indir    -- directory holding the MOLES XML records to update
    outdir   -- directory the updated records are written to, under the
                same file names
    keywords -- flat sequence of keyword, namespace, key values; its length
                must be a multiple of three

    Every file in indir whose name contains '.xml' is parsed, gets one
    dgStructuredKeyword child added to its dgMetadataRecord per triple, and
    is written, pretty printed, to outdir. Files that cannot be parsed are
    reported and skipped; other files are reported and ignored.

    Prints a message and exits through sys.exit(1) when an argument is
    empty or the keywords do not form complete triples. The check looks
    only at the arguments, so the function can also be called from other
    programs regardless of their own command line.
    '''
    if not indir or not outdir or not keywords:
        print("USAGE: keywordAdder indir, outdir, keywords ")
        print(" where indir= full path of directory where MOLES records reside,")
        print("       outdir= full path of where you want the updated records to go.")
        print("       keywords = list triples:- keyword, namespace, key. Must be multiple of three.")
        # Non-zero status so calling scripts can tell that nothing was done.
        sys.exit(1)

    if len(keywords) % 3 != 0:
        print("Keywords must be in triples. keyword namespace key.")
        sys.exit(1)

    print("INFO: moles records are in %s" % (indir,))
    print("INFO: moles records output to %s" % (outdir,))
    print("INFO: keywords to add are %s" % (keywords,))

    numfilesproc = 0
    keywordList = _keyword_triples(keywords)

    # moles skeleton for creating new objects
    M = MRW.MolesDoc()

    # Register readable prefixes; otherwise ElementTree names the namespaces
    # ns0, ns1, ns2 etc.
    etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})

    for filename in os.listdir(indir):
        # NOTE(review): this matches '.xml' anywhere in the name, not only as
        # the extension - confirm whether names such as 'x.xml.bak' are meant
        # to be processed.
        if '.xml' not in filename:
            print("WARNING: File %s appears not to be XML. Will not be processed." % (filename,))
            continue

        full_filename = os.path.join(indir, filename)
        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=full_filename).getroot())
        except Exception:
            # Deliberately broad: any unreadable record is skipped, but
            # Ctrl-C and sys.exit() are no longer swallowed.
            print("WARNING: Cannot parse the XML moles document %s. Will not process" % (full_filename,))
            continue

        for keyword in keywordList:
            strValidTerm = str(keyword[0])
            strParentListID = str(keyword[1])
            strTermID = str(keyword[2])
            print("%s %s %s" % (strValidTerm, strParentListID, strTermID))
            dgVTID = M.dgValidTermID(ParentListID=strParentListID, TermID=strTermID)
            dgSK = M.dgStructuredKeyword(dgValidTerm=strValidTerm, dgValidTermID=dgVTID)
            dgMeta.dgMetadataRecord.addChildElem('dgStructuredKeyword', dgSK)

        # now write out updated document
        moles = PrettyPrint(dgMeta.toXML())
        # try/finally instead of 'with': the file's import fallbacks suggest
        # it is meant to run on old Python 2 interpreters as well.
        f = open(os.path.join(outdir, filename), 'w')
        try:
            f.write(moles)
        finally:
            f.close()
        numfilesproc += 1

    print('INFO: keywordAdder.py ran to end. files processed= %s' % (numfilesproc,))
117
118
if __name__=='__main__':
    # Pad the positional arguments with empty strings so that a call with
    # too few arguments reaches main(), which prints the usage text, instead
    # of failing here with an IndexError.
    padded=sys.argv[1:3]+['','']
    indir=padded[0]
    outdir=padded[1]
    # Everything after the two directories is the flat keyword list.
    keywords=sys.argv[3:]
    main(indir,outdir, keywords)
Note: See TracBrowser for help on using the repository browser.