Changeset 1754 for TI01-discovery


Ignore:
Timestamp:
23/11/06 17:41:59 (13 years ago)
Author:
selatham
Message:

Made ndgDoc-style wrapping optional (new wrapFlag argument to oaiClean). Wrapping is turned off in the oai_ingest.py script.

Location:
TI01-discovery/trunk/ingestAutomation/OAIBatch
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • TI01-discovery/trunk/ingestAutomation/OAIBatch/oaiClean.py

    r1599 r1754  
    33# in the NDG eXist database ...  
    44# BNL, October 13, 2006 
    5 # 
     5# History: 
     6#  SEL 23/11/2006  Make wrapping optional - so can just remove namespace 
    67 
    78import os 
     
    1718        import ElementTree as ET 
    1819 
    19 def oaiClean(inputDirectoryName,outputDirectoryName,fileName,dcKeyWords=[]): 
     20def oaiClean(inputDirectoryName,outputDirectoryName,fileName,wrapFlag,dcKeyWords=[]): 
    2021     
    2122    ''' takes a file (fileName) in directory inputDirectoryName and cleans it up and writes 
    2223    it out in outputDirectoryName, with the following cleaning actions carried out: 
    2324        1) we strip namespaces, 
     25        Optionally 
    2426        2) we add the ndg dcKeywords (a list), and  
    2527        3) wrap it up in an instance of <ndgDoc> ''' 
     
    3436    inTree=ET.parse(open(fIn)) 
    3537     
    36     #create the output element and populate it with keywords 
    37     outTree=ET.Element('ndgDoc') 
    38     kws=ET.SubElement(outTree,'keywords') 
    39     for keyword in dcKeyWords:  
    40         k=ET.SubElement(kws,'keyword') 
    41         k.text=keyword 
    42     doc=ET.SubElement(outTree,'doc') 
    43      
    44      
    4538    #get the root element of the input and clean it up ... 
    4639    root=inTree.getroot() 
    47      
    48     # fix namespaces ...  
     40 
     41    # fix namespaces ... 
    4942    newDoc=removeNS(root) 
    50          
    51     #bif it in the output 
    52     doc.append(newDoc) 
    53      
    54     # this is wrong, we need to know what the input encoding is, but it'll do for now. 
    55     # if we do need to muck with it, see http://effbot.org/zone/celementtree-encoding.htm 
    56     # which also has links to methods of recognising the encoding ... 
    57     output=ET.ElementTree(outTree) 
    58     output.write(fOut,encoding='UTF-8') 
    59      
     43 
     44 
     45    #if wrapping is indicated, create a wrapper output element and populate it with keywords 
     46    if wrapFlag==True: 
     47        outTree=ET.Element('ndgDoc') 
     48        kws=ET.SubElement(outTree,'keywords') 
     49        for keyword in dcKeyWords: 
     50                k=ET.SubElement(kws,'keyword') 
     51                k.text=keyword 
     52        doc=ET.SubElement(outTree,'doc') 
     53        #bif it in the output 
     54        doc.append(newDoc) 
     55        # this is wrong, we need to know what the input encoding is, but it'll do for now. 
     56        # if we do need to muck with it, see http://effbot.org/zone/celementtree-encoding.htm 
     57        # which also has links to methods of recognising the encoding ... 
     58        output=ET.ElementTree(outTree) 
     59        output.write(fOut,encoding='UTF-8') 
     60    else: 
     61        # otherwise just put the cleaned-up original in the output 
     62        output=ET.ElementTree(newDoc) 
     63        output.write(fOut,encoding='UTF-8') 
     64 
     65 
     66 
    6067def removeNS(e): 
    6168    ''' removes namespaces from element e ... the hardway, there must be an easier way ''' 
     
    7885        y.append(removeNS(child)) 
    7986    return y 
    80      
     87 
    8188 
    8289if __name__=="__main__": 
    8390    import sys 
    8491    try: 
    85         indir,outdir,filename=sys.argv[1:4] 
    86         if len(sys.argv)==5:  
    87             keywords=sys.argv[4].split(',') 
     92        indir,outdir,filename,wrapFlag=sys.argv[1:5] 
     93        if len(sys.argv)==6: 
     94            keywords=sys.argv[5].split(',') 
    8895        else: 
    8996            keywords=[] 
    9097    except: 
    9198        print sys.argv[1:] 
    92         print "usage: oaiClean indirectory outdirectory filename keyword1,keyword2,... " 
     99        print "usage: oaiClean indirectory outdirectory filename wrapFlag keyword1,keyword2,... " 
     100        print "     : wrapFlag expects a boolean." 
    93101        exit() 
    94     oaiClean(indir,outdir,filename,keywords) 
     102    oaiClean(indir,outdir,filename,wrapFlag,keywords) 
  • TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py

    r1620 r1754  
    133133# The file config.properties contains the location of the particular datacentres harvested records. 
    134134# Copy the datacentre specific version of config to config.properties file. 
    135  
    136135commandline = "cp /usr/local/WSClients/OAIBatch/" + datacentre +"_config.properties /usr/local/WSClients/OAIBatch/config.properties" 
    137136print "Executing : " + commandline 
     
    147146indir="/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals" 
    148147outdir="/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery" 
     148wrapFlag=false 
    149149filenames = os.listdir(indir) 
    150150for filename in filenames: 
     
    153153                print "Processing : " + full_filename 
    154154                #try: 
    155                 oaiClean.oaiClean(indir,outdir,filename,datacentre_groups) 
     155                oaiClean.oaiClean(indir,outdir,filename,wrapFlag) 
    156156                #except: 
    157157                #    sys.exit("Failed at processing file %s with oaiClean.py stage with status %s" %(full_filename, sys.exc_info())) 
Note: See TracChangeset for help on using the changeset viewer.