source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/keywordAdder.py @ 4854

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/keywordAdder.py@6130
Revision 4854, 3.9 KB checked in by cbyrom, 11 years ago (diff)

Add new ingest script to allow ingest of DIF docs from an eXist-hosted
atom feed. NB, this required a restructure of the original OAI harvester
to allow re-use of shared code, by abstracting this out into a new class,
abstractdocumentingester.

Add new documentation and tidy up codebase removing dependencies where possible to simplify things.

Line 
1#!/usr/bin/env python
2'''
3adds Structured Keywords to moles documents
4'''
5from xml.etree import ElementTree as etree
6from xml.etree import cElementTree
7import csml.csml2Moles.molesReadWrite as MRW
8import sys, os, logging
9
10
def PrettyPrint(elem,indent='',html=0,space='   '):
    '''Lightweight pretty printing of elementTree elements.

    Serialises elem and its descendants as indented XML text, one element
    per line.

    elem   -- ElementTree element to serialise.
    indent -- string prefixed to the line holding elem's opening tag.
    html   -- accepted for backward compatibility; it is not used here and
              is only passed on unchanged to the recursive calls.
    space  -- indentation unit added for each level of nesting.

    Returns the serialised text with no trailing newline.

    Leading/trailing whitespace of element text is stripped.  Only the tail
    text of the *last* child is kept (it is written just before the parent's
    closing tag); tails of earlier children are discarded.
    '''
    # Imported locally so this function does not rely on a file-level import.
    from xml.sax.saxutils import escape

    def estrip(text):
        ''' Get rid of unwanted whitespace and escape XML markup characters '''
        if text is None:
            return ''
        # Without escaping, '&' or '<' in the data would yield malformed XML.
        return escape(text.strip())

    # Attribute values are written inside double quotes, so '"' must be
    # escaped in addition to the usual '&', '<' and '>'.
    strAttrib = ''.join(
        ' %s="%s"' % (att, escape('%s' % (val,), {'"': '&quot;'}))
        for att, val in elem.attrib.items())
    result = '%s<%s%s>%s' % (indent, elem.tag, strAttrib, estrip(elem.text))
    if len(elem):
        item = None
        for item in elem:
            # Pass html and space down so that a caller-supplied indentation
            # unit applies at every depth, not only the first level.
            result += '\n' + PrettyPrint(item, indent=indent+space,
                                         html=html, space=space)
        # item is now the last child; its tail precedes the closing tag.
        result += '\n%s%s</%s>' % (indent, estrip(item.tail), elem.tag)
    else:
        result += '</%s>' % (elem.tag)
    return result
31
def main(indir, outdir, keywords):
    '''Add structured keywords to every MOLES record found in indir.

    indir    -- directory holding the MOLES XML records to update.
    outdir   -- directory the updated records are written to; each output
                file keeps the name of its input file.
    keywords -- flat sequence whose length is a multiple of three, read as
                consecutive (keyword, namespace, key) triples.

    Prints a usage/error message and exits with status 1 (raises SystemExit)
    when an argument is missing or keywords is not made of whole triples.

    Files whose name does not contain '.xml' are logged and skipped.  A file
    that fails to load, update or write is logged and skipped; processing
    continues with the next file.
    '''
    if not indir or not outdir or not keywords:
        print("USAGE: keywordAdder indir, outdir, keywords ")
        print(" where indir= full path of directory where MOLES records reside,")
        print("       outdir= full path of where you want the updated records to go.")
        print("       keywords = list triples:- keyword, namespace, key. Must be multiple of three.")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    if len(keywords) % 3 != 0:
        print("ERROR: Keywords must be in triples: keyword namespace key.")
        print("Actual:  %s" % (keywords,))
        sys.exit(1)

    logging.info("moles records are in %s", indir)
    logging.info("moles records output to %s", outdir)
    logging.info("keywords to add are %s", keywords)

    numfilesproc = 0

    # split the flat keyword sequence into a list of triples
    keywordList = [list(keywords[start:start + 3])
                   for start in range(0, len(keywords), 3)]

    # moles skeleton for creating new objects
    M=MRW.MolesDoc()

    #this is a fix to the  ElementTree namespace problem that namespaces are usually represented as ns0, ns1, ns2 etc.
    # NOTE(review): _namespace_map is a private ElementTree attribute.
    etree._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})

    for filename in os.listdir(indir):
        if filename.find('.xml') == -1:
            logging.error("File %s appears not to be XML. Will not be processed." %filename)
            continue

        full_filename = os.path.join(indir, filename)
        dgMeta=MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=full_filename).getroot())
            for keyword in keywordList:
                strValidTerm = str(keyword[0])
                strParentListID = str(keyword[1])
                strTermID = str(keyword[2])
                dgVTID=M.dgValidTermID(ParentListID=strParentListID, TermID=strTermID)
                dgSK=M.dgStructuredKeyword(dgValidTerm=strValidTerm, dgValidTermID=dgVTID)
                dgMeta.dgMetadataRecord.addChildElem('dgStructuredKeyword', dgSK)
            moles = PrettyPrint(dgMeta.toXML())
        except Exception as detail:
            logging.error("Cannot parse the XML moles document %s. Will not process" %full_filename)
            logging.error("Detail: %s" %detail)
            continue

        # now write out updated document
        out_filename = os.path.join(outdir, filename)
        try:
            # 'with' guarantees the file is closed even if the write fails.
            with open(out_filename, 'w') as f:
                f.write(moles)
        except Exception as detail:
            # Reported separately so a write failure is not mistaken for a
            # parse failure of the input document.
            logging.error("Cannot write the updated moles document %s. Will not process" %out_filename)
            logging.error("Detail: %s" %detail)
            continue
        numfilesproc += 1

    logging.info('keywordAdder.py ran to end. files processed= %s' %(numfilesproc))
99
100
if __name__=='__main__':
    # Command line: keywordAdder.py indir outdir keyword namespace key [...]
    cli_args = sys.argv[1:]
    # Pad with None so that missing arguments reach main(), which prints the
    # usage text, instead of raising IndexError here.
    indir, outdir = (cli_args[:2] + [None, None])[:2]
    keywords = cli_args[2:]
    main(indir, outdir, keywords)
Note: See TracBrowser for help on using the repository browser.