1 | #!/usr/bin/env python |
---|
2 | """ Script oai_ingest.py takes parameter <datacentre>. |
---|
3 | The /usr/local/WSClients/OAIBatch directory contains this python script, a config file |
---|
4 | and some java which handle difs after harvesting. The pre-processed files are then ingested |
---|
5 | to the eXist XML db. |
---|
6 | |
---|
7 | Under this directory the following structure should be maintained: |
---|
8 | |
---|
9 | ./data |
---|
10 | - /DATACENTRE/ |
---|
11 | - discovery/: Records with the namespace and schema declarations removed by this script, |
---|
12 | ready to ingest into the discovery service. |
---|
13 | - oai/difYYYYMMDD/ Records as harvested from OAI |
---|
14 | |
---|
15 | Where /DATACENTRE varies for the different data providers |
---|
16 | |
---|
17 | """ |
---|
18 | #History: |
---|
19 | # 12/05/06 SEL spelling correction |
---|
20 | # 30/05/06 SEL cope with many files for processing."Argument list too long" problem. |
---|
21 | # 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version). |
---|
22 | |
---|
23 | import os |
---|
24 | import sys |
---|
25 | import commands |
---|
26 | import string |
---|
27 | |
---|
# Extra standard-library modules needed by the steps below (imported here so
# that this section is self-contained).
import shutil
import subprocess
import time

# Installation directory holding this script, the per-datacentre config files,
# oai_Proc.jar and the data/ tree.
OAIBATCH_HOME = '/usr/local/WSClients/OAIBatch'

# The directory to put things for a tape backup (should already exist)
BACKUP_DIR = '/disks/glue1/oaiBackup/'

# eXist admin password (replace xxxxxx in real version).
# NOTE(review): the password is handed to client.sh on its command line, so it
# is visible in the process list while the ingest runs.
EXIST_PASSWORD = 'xxxxxx'


def list_visible(directory):
    """Return the sorted names in `directory` that do not start with a dot.

    This is the set of names `ls -1` would list, which is what the earlier
    shell-pipeline version of this script operated on.
    """
    names = [name for name in os.listdir(directory) if not name.startswith('.')]
    names.sort()
    return names


def remove_files(directory, prefix=''):
    """Delete every visible entry of `directory` whose name starts with `prefix`.

    Returns the number of entries removed.  Raises OSError if an entry cannot
    be removed (for example because it is a sub-directory).
    """
    removed = 0
    for name in list_visible(directory):
        if name.startswith(prefix):
            # Always join with the directory: a bare name would be resolved
            # against the current working directory instead.
            os.remove(os.path.join(directory, name))
            removed += 1
    return removed


def reset_directory(directory):
    """Empty `directory` of visible files if it exists, otherwise create it.

    Only the last path component is created; the parent must already exist.
    """
    if os.path.isdir(directory):
        remove_files(directory)
    else:
        os.mkdir(directory)


def copy_files(source_dir, target_dir):
    """Copy every visible entry of `source_dir` into `target_dir`.

    Files are copied one at a time, so the number of files is not limited by
    the length of a command line.
    """
    for name in list_visible(source_dir):
        shutil.copy(os.path.join(source_dir, name), target_dir)


def read_harvest_home(config_filename):
    """Return the value of the `host_path` entry in a datacentre config file.

    The first line whose first whitespace-separated word is `host_path` and
    which has a second word supplies the value.  Returns "" if there is no
    such line.  Raises IOError if the file cannot be opened.
    """
    config_file = open(config_filename, "r")
    try:
        for line in config_file:
            words = line.split()
            if len(words) >= 2 and words[0] == 'host_path':
                return words[1]
    finally:
        # Close the file on every path, including the early return.
        config_file.close()
    return ""


def run_stage(failure_message, action, *args):
    """Call action(*args); exit with `failure_message` if it raises an OS error."""
    try:
        return action(*args)
    except EnvironmentError:
        # sys.exc_info() rather than "except ... as" keeps the syntax valid on
        # both old Python 2 and Python 3 interpreters.
        sys.exit("%s: %s" % (failure_message, sys.exc_info()[1]))


def run_command(command):
    """Run `command` (an argument list, no shell) and return its exit status.

    Returns 127 if the program could not be started at all.
    """
    try:
        return subprocess.call(command)
    except OSError:
        print("Could not execute %s: %s" % (command[0], sys.exc_info()[1]))
        return 127


def preprocess_records(discovery_dir):
    """Run oai_Proc.jar on each file in `discovery_dir` whose name contains '.xml'.

    Returns the number of files processed.  Exits the script as soon as one
    java run returns a non-zero status.
    """
    processed = 0
    # The listing is taken once, before any processing starts.
    for filename in os.listdir(discovery_dir):
        full_filename = discovery_dir + "/" + filename
        if filename.find('.xml') == -1:
            print('File %s is not xml format. Not processed' % full_filename)
            continue
        print("Processing : " + full_filename)
        command = ['java', '-jar', OAIBATCH_HOME + '/oai_Proc.jar', full_filename]
        print(' '.join(command))
        status = run_command(command)
        if status != 0:
            sys.exit("Failed at processing file %s with java oai_Proc.jar stage with status %s" % (full_filename, status))
        processed += 1
    return processed


def main(argv=None):
    """Pre-process and ingest the harvested records of one datacentre.

    `argv` defaults to sys.argv; argv[1] is the <datacentre> name.  Each
    stage that fails ends the script through sys.exit with a message naming
    the stage.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        # Exit with a failure status: a missing parameter is an error.
        sys.exit("<datacentre> parameter not supplied.")
    datacentre = argv[1]

    # Other settings and constants
    date_string = time.strftime('%y%m%d_%H%M')
    # Assign through os.environ (not os.putenv) so that both child processes
    # and Python's own executable lookup see the new values.
    os.environ['EXIST_HOME'] = '/usr/local/eXist'
    os.environ['PATH'] = ':/usr/java/j2sdk1.4.2_04/lib/tools.jar:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch:/usr/local/eXist/bin:/bin:/usr/bin:.'
    os.environ['CLASSPATH'] = '.:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch'

    # Get the harvested records directory for this datacentre from the config
    # file for that data centre (its host_path entry).
    datacentre_config_filename = OAIBATCH_HOME + "/" + datacentre + "_config.properties"
    print("Datacentre config file = %s" % datacentre_config_filename)
    config_failure = "Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename
    harvest_home = run_stage(config_failure, read_harvest_home, datacentre_config_filename)
    if harvest_home == "":
        sys.exit(config_failure)

    if len(os.listdir(harvest_home)) == 0:
        print("Nothing to harvest this time from %s" % datacentre)
        sys.exit()

    difcopy_dir = OAIBATCH_HOME + "/data/" + datacentre + "/oai/difcopy"
    discovery_dir = OAIBATCH_HOME + "/data/" + datacentre + "/discovery"

    # Create/clear the directory for a pristine copy of the difs in case the
    # script rewrites something wrong, then make the pristine copy.
    print("Preparing pristine copy directory : " + difcopy_dir)
    run_stage("Failed at creating copy dir stage", reset_directory, difcopy_dir)
    print("Copying records : %s -> %s" % (harvest_home, difcopy_dir))
    run_stage("Failed at making pristine copy stage", copy_files, harvest_home, difcopy_dir)

    # Create/clear the directory for the processing copy of the difs, then
    # make the processing copy.  A failure to prepare the directory is now
    # reported instead of being ignored.
    print("Preparing processing directory : " + discovery_dir)
    run_stage("Failed at creating processing copy dir stage", reset_directory, discovery_dir)
    print("Copying records : %s -> %s" % (harvest_home, discovery_dir))
    run_stage("Failed at making processing copy stage", copy_files, harvest_home, discovery_dir)

    # The file config.properties contains the name=value pair to parse the
    # filename in java oaiProc.jar.  Copy the datacentre specific version of
    # config to config.properties file.
    print("Copying config : " + datacentre_config_filename)
    run_stage("Failed at copying config file stage", shutil.copy,
              datacentre_config_filename, OAIBATCH_HOME + "/config.properties")

    # Change os directory to that with the java.jar in it.
    os.chdir(OAIBATCH_HOME)

    # Execute the java which processes the files.  It reads the files from the
    # discovery directory and outputs the results within the same directory.
    numfilesproc = preprocess_records(discovery_dir)

    # Once the pre-processing has finished remove the originals (the files
    # still carrying the "oai..." style name) from the discovery directory.
    print("Removing original oai* records from : " + discovery_dir)
    run_stage("Failed at removing original oai style records from discovery directory",
              remove_files, discovery_dir, 'oai')

    # Ingest the datacentres records into eXist db (backups of exist happen
    # nightly).  The command is not echoed because it contains the password.
    status = run_command(['client.sh', '-c', '/db/dif/' + datacentre,
                          '-u', 'admin', '-P', EXIST_PASSWORD,
                          '-p', discovery_dir + '/'])
    if status != 0:
        sys.exit("Failed at ingesting into exist db. Datacentre = %s. Status = %s" % (datacentre, status))

    # Make copies of oai/difcopy and discovery areas to backup area for tape backups
    for suffix, source_dir in (("_difcopy", difcopy_dir), ("_discovery", discovery_dir)):
        this_backupdir = BACKUP_DIR + datacentre + "_" + date_string + suffix
        print("Creating backup directory : " + this_backupdir)
        run_stage("Failed at creating backup directory %s" % this_backupdir,
                  os.mkdir, this_backupdir)
        print("Copying records : %s -> %s" % (source_dir, this_backupdir))
        run_stage("Failed at copying to backup directory %s" % this_backupdir,
                  copy_files, source_dir, this_backupdir)

    # Clear out the original harvest records area
    print("Clearing harvest records area : " + harvest_home)
    run_stage("Failed at clearing out original harvest records area %s" % harvest_home,
              remove_files, harvest_home)

    print("======================================================")
    print("No. of files pre-processed = %s" % numfilesproc)
    # Every failing stage has already exited, so reaching here means success.
    print(" Procedure oai_ingest.py ran to end")
    print("======================================================")


if __name__ == "__main__":
    main()