source: TI01-discovery/trunk/ingestAutomation/OAIBatch/keywordAdder.py @ 1869

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/keywordAdder.py@1869
Revision 1869, 3.4 KB checked in by selatham, 14 years ago (diff)

further work on ingest

Line 
1#!/usr/bin/env python
2# keywordAdder - adds Structured Keywords to moles documents
3import cElementTree
4import elementtree.ElementTree as etree
5import molesReadWrite as MRW
6import sys
7import os
8
def PrettyPrint(elem, indent='', html=0, space='   '):
    '''Lightweight pretty printing of elementTree elements.

    Returns a string with one element per line, each nesting level indented
    by one more copy of ``space``.

    elem   -- the element to render (children are rendered recursively)
    indent -- prefix written before this element's opening tag
    html   -- accepted for interface compatibility; not used by the
              rendering itself, only passed on to the recursive calls
    space  -- the indentation unit added for every nesting level

    NOTE: text and attribute values are written out as they are, with no
    XML escaping.  Only the tail of the last child is kept, and all text is
    stripped of surrounding whitespace.
    '''
    def estrip(text):
        ''' Just want to get rid of unwanted whitespace '''
        if text is None:
            return ''
        return text.strip()

    strAttrib = ''.join([' %s="%s"' % (att, elem.attrib[att])
                         for att in elem.attrib])
    result = '%s<%s%s>%s' % (indent, elem.tag, strAttrib, estrip(elem.text))
    children = list(elem)
    if children:
        for item in children:
            # Pass html and space down so that a caller-supplied indentation
            # unit applies at every depth, not just the first level.
            result += '\n' + PrettyPrint(item, indent=indent + space,
                                         html=html, space=space)
        result += '\n%s%s</%s>' % (indent, estrip(children[-1].tail), elem.tag)
    else:
        result += '</%s>' % (elem.tag)
    return result
29
def addNewElement(parentobject, childname, child):
    '''Attach child to parentobject under the attribute childname.

    - If the attribute does not exist yet, it is set to child.
    - If it already holds a list, child is appended to that list in place.
    - If it holds any other single value, the attribute is replaced by a
      two-item list of [existing value, child].
    '''
    if not hasattr(parentobject, childname):
        setattr(parentobject, childname, child)
        return

    currentattribute = getattr(parentobject, childname)
    # isinstance, not "is list": the old identity test against the list type
    # was never true, so existing lists were wrapped in a new list each time.
    if isinstance(currentattribute, list):
        currentattribute.append(child)
    else:
        setattr(parentobject, childname, [currentattribute, child])
42
def main(indir, outdir, keywords):
    '''Add one structured keyword to every MOLES XML record in a directory.

    Every file in indir whose name contains ".xml" is read into a dgMetadata
    object, a dgStructuredKeyword built from keywords is attached to its
    dgMetadataRecord, and the result is pretty-printed to a file of the same
    name in outdir.  Files that cannot be parsed are reported and skipped.

    indir    -- directory holding the MOLES records to read
    outdir   -- directory the updated records are written to
    keywords -- sequence whose first three items are the valid term, the
                parent list ID (namespace) and the term ID (key)

    Prints a usage message and exits via sys.exit(1) if any argument is
    empty or keywords has fewer than three items.
    '''
    # Validate the arguments themselves rather than sys.argv, so the check
    # is also meaningful when main() is called from other code.
    if not indir or not outdir or not keywords or len(keywords) < 3:
        print("USAGE: keywordAdder(indir, outdir, keywords) ")
        print(" where indir= full path of directory where MOLES records reside,")
        print("       outdir= full path of where you want the updated records to go.")
        print("       keywords = list of [keyword, namespace, key] which needs to be added")
        sys.exit(1)

    print("INFO: moles records are in %s" % indir)
    print("INFO: moles records output to %s" % outdir)
    # Wrap in a 1-tuple so a tuple of keywords is formatted as a single value.
    print("INFO: keywords to add are %s" % (keywords,))

    # initialise variables
    numfilesproc = 0

    # the keyword fields are the same for every file
    strValidTerm = str(keywords[0])
    strParentListID = str(keywords[1])
    strTermID = str(keywords[2])

    # moles skeleton for creating new objects
    M = MRW.MolesDoc()

    # this is a fix to the ElementTree namespace problem that namespaces are
    # usually represented as ns0, ns1, ns2 etc.
    etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink': 'xlink'})

    for filename in os.listdir(indir):
        if filename.find('.xml') == -1:
            print("WARNING: File %s appears not to be XML. Will not be processed." % filename)
            continue

        full_filename = os.path.join(indir, filename)
        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=full_filename).getroot())
        except Exception:
            # Skip unreadable records, but let KeyboardInterrupt/SystemExit
            # through instead of swallowing them.
            print("WARNING: Cannot parse the XML moles document %s. Will not process" % full_filename)
            continue

        print("%s %s %s" % (strValidTerm, strParentListID, strTermID))
        dgVTID = M.dgValidTermID(ParentListID=strParentListID, TermID=strTermID)
        dgSK = M.dgStructuredKeyword(dgValidTerm=strValidTerm, dgValidTermID=dgVTID)
        addNewElement(dgMeta.dgMetadataRecord, 'dgStructuredKeyword', dgSK)

        # now write out updated document
        molestree = dgMeta.toXML()
        moles = PrettyPrint(molestree)
        f = open(os.path.join(outdir, filename), 'w')
        try:
            f.write(moles)
        finally:
            # close the file even if the write fails
            f.close()
        numfilesproc += 1

    print('INFO: keywordAdder.py ran to end. files processed= %s' % (numfilesproc))
94
95
if __name__ == '__main__':
    # Command line: keywordAdder.py indir outdir keyword namespace key
    # main() reads three keyword fields, so five arguments are needed after
    # the script name.  Check before indexing sys.argv so that a short
    # command line gives a usage message instead of an IndexError.
    if len(sys.argv) < 6:
        print("USAGE: keywordAdder.py indir outdir keyword namespace key")
        sys.exit(1)
    indir = sys.argv[1]
    outdir = sys.argv[2]
    keywords = sys.argv[3:]
    main(indir, outdir, keywords)
Note: See TracBrowser for help on using the repository browser.