#!/usr/bin/env python
""" Script oai_ingest.py takes parameter <datacentre>.
The /usr/local/WSClients/OAIBatch directory contains this python script, a config file
and the oaiClean.py class which cleans up discovery records after harvesting.
The pre-processed files are then ingested to the eXist XML db.

Under this directory the following structure should be maintained:

./data
    - /DATACENTRE/
        - discovery/:       Records with namespace, schema declaration deleted - after having run
                            the oaiClean script. Ready to ingest in the discovery service.
        - oai/difYYYYMMDD/  Records as harvested from OAI

Where /DATACENTRE varies for the different data providers

"""
#History:
# 12/05/06 SEL spelling correction
# 30/05/06 SEL cope with many files for processing. "Argument list too long" problem.
# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version).
# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code.
# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade.
# 17/10/06 SEL cope with different discovery formats - not just DIF.
# 23/10/06 SEL keywords not mandatory in config file.
# 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running.

import commands
import os
import string
import sys
import time

import oaiClean

# Module-level state.  The three empty strings are placeholders that are
# filled in from the datacentre config file further down.
status = 0              # exit status of the most recent shell command
numfilesproc = 0        # number of files handed to oaiClean
harvest_home = ""       # value of the 'host_path' config key
datacentre_groups = ""  # values of the 'groups' config key
datacentre_format = ""  # value of the 'format' config key

# The datacentre name is the single required command-line argument.
if len(sys.argv) < 2:
    # Give sys.exit the message so the script ends with a non-zero exit
    # status (and the text on stderr), as every other failure path in this
    # script does.  A bare sys.exit() would report success to the caller.
    sys.exit("<datacentre> parameter not supplied.")
datacentre = sys.argv[1]

# Other settings and constants

# Local-time stamp (YYMMDD_HHMM) used in the names of this run's backup
# directories.  Produced in-process rather than by spawning the external
# `date` command through the Python-2-only `commands` module.
date_string = time.strftime("%y%m%d_%H%M")

# Environment inherited by the child processes started with os.system().
# Assigning to os.environ also calls putenv(), and -- unlike a bare
# os.putenv() -- keeps os.environ consistent with what the children see.
os.environ['EXIST_HOME'] = '/usr/local/exist-client'
os.environ['PATH'] = ':/usr/java/jdk1.5.0_03/jre:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.'
# NOTE(review): CLASSPATH names j2sdk1.4.2_04 while PATH names jdk1.5.0_03 --
# confirm whether the older JDK path is still intended.
os.environ['CLASSPATH'] = '.:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch'

# Get the harvested records directory and groups for this datacentre from the
# datacentre specific config file.
# The harvested records directory depends on the datacentre's OAI base url, the
# set and format. These have to be known up-front.
# The groups denote which 'portal groups' they belong to - for limiting
# searches to say NERC-only datacentres records.
# Groups are added to the xml record by oaiClean.py.
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("Datacentre config file = %s" % datacentre_config_filename)

datacentre_config_file = open(datacentre_config_filename, "r")
try:
    # Every useful line has the form "<key> <value> [<value> ...]",
    # separated by whitespace.
    for line in datacentre_config_file:
        words = line.split()
        if len(words) < 2:
            # Blank line, or a key with no value: there is nothing to record,
            # and indexing words[1] would raise IndexError.
            continue
        key = words[0]
        if key == 'host_path':
            harvest_home = words[1]
        elif key == 'groups':
            # All remaining words on the line are group names.
            datacentre_groups = words[1:]
        elif key == 'format':
            datacentre_format = words[1]
finally:
    # Release the file handle even when reading the file raises.
    datacentre_config_file.close()

# host_path is mandatory.
if harvest_home == "":
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: harvested records are in %s" % harvest_home)

# groups/keywords are optional.
if not datacentre_groups:
    print("INFO: No groups/keywords set for datacentre %s" % datacentre)
else:
    print("INFO: datacentre groups/keywords = %s" % datacentre_groups)

# format is mandatory.
if datacentre_format == "":
    sys.exit("Failed at stage: getting datacentre format. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: format being harvested = %s" % datacentre_format)

# Any records to harvest?  If the harvest directory is empty there is nothing
# to do, so finish here with a normal (successful) exit.
if not os.listdir(harvest_home):
    print("Nothing to harvest this time from %s" % datacentre)
    sys.exit()

# The directory to put things for a tape backup (should already exist)
backupdir = '/disks/glue1/oaiBackup/'

# Create/clear the 'in' directory that holds the pristine copy of the
# discovery records.
originals_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
if os.path.isdir(originals_dir):
    # Already there: empty it.  The listing is piped through xargs so that a
    # directory with very many files does not hit "Argument list too long".
    # The shell reduces {\} to the {} placeholder that xargs -i substitutes.
    commandline = "ls -1 %s/ | xargs -i rm %s/{\\}" % (originals_dir, originals_dir)
else:
    commandline = "mkdir " + originals_dir
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at creating copy dir stage")

# Make the 'in' pristine copy from the harvest directory, again one file at a
# time through xargs to cope with there being lots of files.
commandline = "ls -1 %s/ | xargs -i cp %s/{\\} %s" % (harvest_home, harvest_home, originals_dir)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making pristine copy stage")

# Create/clear the directory for the 'out' processed copy of the discovery records.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
if os.path.isdir(discovery_dir):
    # Already there: empty it.  The listing is piped through xargs so that a
    # directory with very many files does not hit "Argument list too long".
    # The shell reduces {\} to the {} placeholder that xargs -i substitutes.
    commandline = "ls -1 " + discovery_dir + "/ | xargs -i rm " + discovery_dir + "/{\\}"
else:
    commandline = "mkdir " + discovery_dir
print("Executing : " + commandline)
status = os.system(commandline)
# Stop here on failure, as the 'in' directory stage does.  Without this check
# the status is overwritten by the next command and the failure goes unnoticed.
if status != 0:
    sys.exit("Failed at creating processed copy dir stage")

# History: until 16/10/06 the harvested records were also copied into this
# directory for processing.  That copy was removed; the directory now holds
# the 'out' processed records only.

# The file config.properties contains the location of the particular
# datacentre's harvested records, so put this datacentre's version of the
# config in place under that generic name.
oaibatch_home = "/usr/local/WSClients/OAIBatch"
commandline = "cp %s/%s_config.properties %s/config.properties" % (oaibatch_home, datacentre, oaibatch_home)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at copying config file stage")

# Change os directory to that with the oaiClean.py in it. (need this?)
os.chdir(oaibatch_home)


# Execute the script which processes the files: every xml file in the 'in'
# directory is passed to oaiClean, which is given the 'out' directory too.
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
wrapFlag = False
filenames = os.listdir(indir)
for filename in filenames:
    # Build the full path before branching: both messages below need it.
    # It used to be set only in the xml branch, so a non-xml file raised
    # NameError or was reported under the previous file's name.
    full_filename = indir + "/" + filename
    # NOTE(review): this is a substring test, so a name such as
    # 'record.xml.bak' is treated as xml too -- confirm that is intended.
    if '.xml' not in filename:
        print('File %s is not xml format. Not processed' % (full_filename))
        continue
    print("Processing : " + full_filename)
    # No try/except here: an exception raised by oaiClean propagates and
    # aborts the whole run.
    oaiClean.oaiClean(indir, outdir, filename, wrapFlag)
    numfilesproc += 1

# History: the step that removed original oai* records from the discovery
# directory once pre-processing had finished was dropped on 16/10/06 as no
# longer needed.


# Ingest the datacentre's records into eXist db (backups of exist happen nightly).
# 'xxxxxx' is the password placeholder; it is replaced in the real version.
commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/%s/%s -u admin -P xxxxxx -p /usr/local/WSClients/OAIBatch/data/%s/discovery/" % (datacentre_format, datacentre, datacentre)
# The command text itself is not echoed -- presumably because it carries the
# password.
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre = %s. Status = %s" % (datacentre, status))

# Make copies of the oai/originals and discovery areas to the backup area for
# tape backups.  Each area gets its own directory, named
# <backupdir><datacentre>_<date_string>_<area>, created and then filled.
data_root = "/usr/local/WSClients/OAIBatch/data/" + datacentre
backup_jobs = (
    ("_originals", data_root + "/oai/originals"),
    ("_discovery", data_root + "/discovery"),
)
for area_suffix, source_dir in backup_jobs:
    this_backupdir = backupdir + datacentre + "_" + date_string + area_suffix

    commandline = "mkdir " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at creating backup directory %s" % this_backupdir)

    # Copy file by file through xargs to cope with very many files; the shell
    # reduces {\} to the {} placeholder that xargs -i substitutes.
    commandline = "ls -1 %s/ | xargs -i cp %s/{\\} %s" % (source_dir, source_dir, this_backupdir)
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at copying to backup directory %s" % this_backupdir)

# Clear out the original harvest records area now that its contents have been
# copied, processed, ingested and backed up.  Files are removed one at a time
# through xargs to cope with very many files.
commandline = "ls -1 %s | xargs -i rm %s/{\\}" % (harvest_home, harvest_home)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at clearing out original harvest records area %s" % harvest_home)


# Closing summary: how many files went through pre-processing, and the status
# of the last shell command that was run.
banner = "======================================================"
print(banner)
print("No. of files pre-processed = %s" % numfilesproc)
if status != 0:
    print("Procedure oai_ingest.py FAILED with status %s" % status)
else:
    print(" Procedure oai_ingest.py ran to end")

print(banner)