source: exist/trunk/xquery/dif2moles.xq @ 4337

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/xquery/dif2moles.xq@4337
Revision 4337, 18.9 KB checked in by sdonegan, 11 years ago (diff)

Updated ingest pipeline xqueries that allow proper translation of urls and datacentre names from the gamut of different dif and moles we have available....

Line 
1(: This query produces one MOLES data entity plus the associated organisation entries for a given DIF instance.
2   Inputs: TargetCollection (the collection where the DIF exists); RepositoryID (the repository in which the
3   existing DIF lies, also expected to be the output RepositoryID - fix it in the output xml if it's wrong);
4   Input_Entry_ID (the DIF Entry_ID); and LocalID (the output localID for the data entity). We also
5   need the localID for the organisation entity, which appears as repository_localid.
6   :)
7(: Note the algorithm for creating organisations that do not already exist :)
8(: dgPersons are not created as one can't tell automatically which are people and which are orgs, and orgs are simpler :) 
9
10(:
11import module namespace voclib='http://ndg.nerc.ac.uk/xquery/lib/vocab' at 'xmldb:exist:///db/xqueryLib/Vocabs/vocab_xquery_lib.xquery';
12import module namespace inputParse='http://ndg.nerc.ac.uk/xquery/lib/inputParse' at 'xmldb:exist:///db/xqueryLib/Utilities/inputParse_xquery_lib.xquery';
13:)
14import module namespace voclib='http://ndg.nerc.ac.uk/xquery/lib/vocab' at 'vocab_xquery_lib.xquery';
15import module namespace inputParse='http://ndg.nerc.ac.uk/xquery/lib/inputParse' at 'inputParse_xquery_lib.xquery';
16
17
18declare default element namespace 'http://ndg.nerc.ac.uk/moles';
19declare namespace dif='http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/';
20declare namespace xsi='http://www.w3.org/2001/XMLSchema-instance';
21
22(: Replaceable parameters :)
(: The values below are literal placeholder strings named after the query inputs listed in the header comment.
   NOTE(review): presumably the ingest pipeline substitutes them textually before execution - confirm. :)
23declare variable $targetCollection as xs:string := 'TargetCollection'; (: passed to collection() to locate the DIF :)
24declare variable $input_repository as xs:string := 'RepositoryID'; (: written to the repositoryIdentifier, CreatedBy and UpdatedBy elements :)
25declare variable $input_repository_local as xs:string := 'repository_localid'; (: only referenced from a commented-out line in the curator role holder :)
26declare variable $input_entry_id as xs:string := 'Input_Entry_ID'; (: compared with dif:Entry_ID to select the DIF :)
27declare variable $output_local_id as xs:string := 'LocalID'; (: NOTE(review): the query body tests this against 'Output_LocalID', not 'LocalID' - confirm which sentinel is intended :)
28
29(: SJD: take the current date as a string and keep only its first 10 characters,
     because the raw current-date() value is not usable in MOLES :)
30declare variable $currentDate as xs:string := substring(current-date() cast as xs:string,1,10);
31(:declare variable $extractDate as xs:string := substring($fullDate,1,10);:)
32
33(:for $DIF in collection(collection($targetCollection))/dif:DIF[dif:Entry_ID=$input_entry_id]:)
34for $DIF in collection($targetCollection)/dif:DIF[dif:Entry_ID=$input_entry_id]
35return
36element dgMetadata {
37element dgMetadataRecord {
(: Identifier of the generated record: the supplied output local id is used unless it equals
   the 'Output_LocalID' sentinel, in which case the DIF Entry_ID is used instead. :)
38element dgMetadataID {
39element schemeIdentifier {'NDG-B0'},
40element repositoryIdentifier {$input_repository},
41element localIdentifier {
42if ($output_local_id != 'Output_LocalID')
43then $output_local_id
44else data($DIF/dif:Entry_ID)}
45},
(: Description block: a generated description ID, the last-updated date, the DIF Summary as the
   abstract, and one descriptionSection (holding a dgSimpleLink) per dif:Related_URL. :)
46element dgMetadataDescription {
47element metadataDescriptionID {
48element schemeIdentifier {'NDG-B0'},
49element repositoryIdentifier {$input_repository},
50if ($output_local_id != 'Output_LocalID') then
51element localIdentifier {concat('generated_desc-', $output_local_id)}
52else
53element localIdentifier {concat('generated_desc-', encode-for-uri(string($DIF/dif:Entry_ID)))}
54},
55element metadataDescriptionLastUpdated {$currentDate}                   ,
56    element abstract {
57        element abstractText {string($DIF/dif:Summary)}
58    }
59    ,
60    for $descOnline in $DIF/dif:Related_URL
61    return
62        element descriptionSection {
63            element descriptionOnlineReference {
64            element dgSimpleLink {
65                (:SJD changed to name picking up from Description element in DIF?? :)
66               (: element URL {encode-for-uri(data($descOnline/dif:URL))},:)
                (: The then-branch must be a single expression, so the name/URL pair is parenthesised;
                   without the parentheses the comma before "else" is a syntax error. :)
                (: NOTE(review): encode-for-uri also escapes ':' and '/', so a complete URL is
                   percent-encoded as a whole - confirm the consumer expects that. :)
67                if (exists($descOnline/dif:Description)) then (
68                    element name {data($descOnline/dif:Description)},
69                    element URL {encode-for-uri(data($descOnline/dif:URL))}
70                ) else
                    (: NOTE(review): without a Description only the literal name 'URL' is emitted and
                       the URL itself is left out, since the line below is disabled - confirm intended. :)
71                    element name {data('URL')}
72                    (:element URL {encode-for-uri(data($descOnline/dif:URL))}:)
73                } }             
74            }   
75        },
(: Both the record name and its abbreviation are taken from dif:Entry_Title. :)
76        element name {string($DIF/dif:Entry_Title)},
77        element abbreviation {string($DIF/dif:Entry_Title)},
78        element dgDataEntity {
79        element dgDataSetType {''},
80        element dgDataSummary {
(: One dgParameterSummary per dif:Parameters element. Each summary holds a placeholder value and unit,
   a nested dgStdParameterMeasured hierarchy (Category, Topic, Term, Variable, Detailed_Variable),
   and a ParameterName / ParameterAbbreviation pair built from the same fields. :)
81        for $parameter in $DIF/dif:Parameters
82        return
83            element dgParameterSummary {
                (: Placeholder value: empty Value and a 'dummy' unit from the unknown vocabulary. :)
84                element dgParameterValue {
85                    element dgValueDataParameter {
86                        element Value {''},
87                        element dgStandardUnit {
88                            element dgValidTerm {'dummy'},
89                            element dgValidTermID {
90                                element ParentListID {$voclib:unknown_vocab_id},
91                                element TermID {encode-for-uri('dummy unit')}
92                            }
93                        }
94                    }
95                },
            (: Level 0: Category. A missing or empty value yields the term 'unknown'; the TermID is
               still derived from the raw DIF value, so it is empty in that case. The same pattern
               is used for Topic and Term below. :)
96            element dgStdParameterMeasured {
97                if ($parameter/dif:Category!='') then element dgValidTerm {string($parameter/dif:Category)}
98                else  element dgValidTerm {'unknown'},
99                    element dgValidTermID {
100                        element ParentListID {concat($voclib:gcmd_science_valids_categories, '/current')},
101                        element TermID {encode-for-uri($parameter/dif:Category)}
102                    },
                    (: Level 1: Topic :)
103                    element dgValidSubterm {
104                        if ($parameter/dif:Topic!='') then element dgValidTerm {string($parameter/dif:Topic)}
105                        else  element dgValidTerm {'unknown'},
106                            element dgValidTermID {
107                                element ParentListID {concat($voclib:gcmd_science_valids_topics, '/current')},
108                                element TermID {encode-for-uri($parameter/dif:Topic)}
109                            },
                    (: Level 2: Term, nested inside the Topic subterm :)
110                    element dgValidSubterm {
111                        if ($parameter/dif:Term!='') then element dgValidTerm {string($parameter/dif:Term)}
112                        else  element dgValidTerm {'unknown'},
113                            element dgValidTermID {
114                                element ParentListID {concat($voclib:gcmd_science_valids_terms, '/current')},
115                                element TermID {encode-for-uri($parameter/dif:Term)}
116                            },
                        (: Level 3: Variable, emitted only when present and non-empty :)
117                        if (exists($parameter/dif:Variable) and $parameter/dif:Variable!='') then
118                            element dgValidSubterm {
119                                element dgValidTerm {string($parameter/dif:Variable)},
120                                element dgValidTermID {
121                                    element ParentListID {concat($voclib:gcmd_science_valids_variables, '/current')},
122                                    element TermID {encode-for-uri($parameter/dif:Variable)}
123                                },
                        (: Level 4: Detailed_Variable, only reachable inside a Variable subterm;
                           it uses the unknown vocabulary as its parent list :)
124                        if (exists($parameter/dif:Detailed_Variable) and $parameter/dif:Detailed_Variable!='') then
125                            element dgValidSubterm {
126                                element dgValidTerm {string($parameter/dif:Detailed_Variable)},
127                                element dgValidTermID {
128                                    element ParentListID {$voclib:unknown_vocab_id},
129                                    element TermID {encode-for-uri($parameter/dif:Detailed_Variable)}
130                                },
131                        element ListLevel {4}}
132                        else (),
133                        element ListLevel {3}
134                        }
135                        else (),
136                        element ListLevel {2}
137                        },
138                        element ListLevel {1}
139                        },
140                        element ListLevel {0}
141                        },
                (: Display name: 'Category > Topic > Term' plus the optional Variable and Detailed_Variable.
                   NOTE(review): the content is a sequence of separate strings; element construction joins
                   adjacent atomic values with a single space, so each optional part ends up preceded by
                   two spaces - confirm whether that is intended. :)
142                element ParameterName {
143                    concat (string($parameter/dif:Category), ' > ', string($parameter/dif:Topic), ' > ', string($parameter/dif:Term)),
144                    if (exists($parameter/dif:Variable) and $parameter/dif:Variable!='') then
145                    concat(' > ', string($parameter/dif:Variable))
146                    else (),
147                    if (exists($parameter/dif:Detailed_Variable) and $parameter/dif:Detailed_Variable!='') then
148                    concat(' > ', string($parameter/dif:Detailed_Variable))
149                    else ()
150                },
                (: The abbreviation is built exactly like ParameterName. :)
151                element ParameterAbbreviation {
152                    concat (string($parameter/dif:Category), ' > ', string($parameter/dif:Topic), ' > ', string($parameter/dif:Term)),
153                    if (exists($parameter/dif:Variable) and $parameter/dif:Variable!='') then
154                    concat(' > ', string($parameter/dif:Variable))
155                    else (),
156                    if (exists($parameter/dif:Detailed_Variable) and $parameter/dif:Detailed_Variable!='') then
157                    concat(' > ', string($parameter/dif:Detailed_Variable))
158                    else ()
159                }
160            }, (: end of dgParameterSummary :)
(: Spatial and temporal coverage. The outer guard lists every input that the inner branches consume,
   including dif:Chronostratigraphic_Unit, so a DIF that only carries chronostratigraphic units
   still gets its dgTemporalCoverage. :)
161if (exists($DIF/dif:Spatial_Coverage) or exists($DIF/dif:Paleo_Temporal_Coverage) or exists($DIF/dif:Location) or exists($DIF/dif:Temporal_Coverage) or exists($DIF/dif:Chronostratigraphic_Unit)) then
162element dgDataCoverage {
163    if (exists($DIF/dif:Spatial_Coverage)  or exists($DIF/dif:Location))  then
164    element dgSpatialCoverage {
        (: A BoundingBox is produced only for Spatial_Coverage elements that carry all four limits;
           each limit is passed through inputParse:fix-coord, a helper defined in the imported module. :)
165        for $boundingbox in $DIF/dif:Spatial_Coverage[exists(dif:Northernmost_Latitude)
166            and exists(dif:Southernmost_Latitude)
167            and exists(dif:Easternmost_Longitude)
168            and exists(dif:Westernmost_Longitude)]
169            return
170                element BoundingBox {
171                    element LimitNorth {data(inputParse:fix-coord($boundingbox/dif:Northernmost_Latitude))},
172                    element LimitSouth {data(inputParse:fix-coord($boundingbox/dif:Southernmost_Latitude))},
173                    element LimitWest {data(inputParse:fix-coord($boundingbox/dif:Westernmost_Longitude))},
174                    element LimitEast {data(inputParse:fix-coord($boundingbox/dif:Easternmost_Longitude))}
175                },
        (: One dgArea per dif:Location, keyed into the GCMD location vocabulary. :)
176        for $location in $DIF/dif:Location
177        return
178            element dgArea {
179                element dgValidTerm {string($location)},
180                element dgValidTermID {
181                element ParentListID {concat($voclib:gcmd_location_valids, '/current')},
182                element TermID {encode-for-uri($location)}
183            }
184        }
185    }
186    else (),
187    if (exists($DIF/dif:Temporal_Coverage) or exists($DIF/dif:Paleo_Temporal_Coverage) or exists($DIF/dif:Chronostratigraphic_Unit)) then
188        element dgTemporalCoverage {
               (: Date ranges are emitted only for coverages that have a start date; a missing
                  stop date gives an empty DateRangeEnd. :)
189               for $temporalcoverage in $DIF/dif:Temporal_Coverage[exists(dif:Start_Date)]
190        return
191            element DateRange {
192                element DateRangeStart {string($temporalcoverage/dif:Start_Date)},
193                element DateRangeEnd {string($temporalcoverage/dif:Stop_Date)}
194            },
195            for $paleotemporalcoverage in $DIF/dif:Paleo_Temporal_Coverage[exists(dif:Paleo_Start_Date)]
196            return
197                element DateRange {
198                element DateRangeStart {string($paleotemporalcoverage/dif:Paleo_Start_Date)},
199                element DateRangeEnd {string($paleotemporalcoverage/dif:Paleo_Stop_Date)}
200                },
            (: One term per dif:Chronostratigraphic_Unit, keyed into the GCMD chronostratigraphic vocabulary. :)
201            for $chronostratigraphic in $DIF/dif:Chronostratigraphic_Unit
202            return
203                element dgChronostratigraphicTerm {
204                    element dgValidTerm {string($chronostratigraphic)},
205                        element dgValidTermID {
206                            element ParentListID {concat($voclib:gcmd_chronostratigraphic_valids, '/current')},
207                            element TermID {encode-for-uri($chronostratigraphic)}
208                        }
209                     }
210                }
211            else ()
212        }
213    else ()
214},
(: Roles attached to the data entity. A dgDataCreator role is built from the citation's
   Dataset_Creator elements when any exist, otherwise from Originating_Center, otherwise it is
   omitted. A dgDataCurator role is always built. :)
215element dgDataRoles {
216if (exists($DIF/dif:Data_Set_Citation/dif:Dataset_Creator)) then
217element dgDataCreator {
218element dgMetadataID {
219element schemeIdentifier {'NDG-B0'},
220element repositoryIdentifier {$input_repository},
(: NOTE(review): here $output_local_id is used unencoded, whereas the Originating_Center branch
   below applies encode-for-uri to the whole identifier - confirm which is intended. :)
221if ($output_local_id != 'Output_LocalID') then
222element localIdentifier {concat('generated_creator-', $output_local_id)}
223else
224element localIdentifier {concat('generated_creator-', encode-for-uri(string($DIF/dif:Entry_ID)))}
225},
226element roleName {'Data Creator'},
227element abbreviation {'Creator'},
(: One role holder per creator. Its 'generated_orgcit-' identifier is built with the same formula
   as the dgOrganisation entries generated at the end of the query. :)
228for $creatorID in $DIF/dif:Data_Set_Citation/dif:Dataset_Creator
229return
230element dgRoleHolder {
231(: SJD changed this to dgOrganisationID from dgMetadataID - assuming simple error :)
232    element dgOrganisationID {
233        element schemeIdentifier {'NDG-B0'},
234        element repositoryIdentifier {$input_repository},
235        if ($output_local_id != 'Output_LocalID') then
236            element localIdentifier  {encode-for-uri(concat('generated_orgcit-', string($creatorID), '-', $output_local_id))}
237        else
238            element localIdentifier  {encode-for-uri(concat('generated_orgcit-', string($creatorID), '-', data($DIF/dif:Entry_ID)))}
239        },
       (: NOTE(review): raw current-date() is used here, while metadataDescriptionLastUpdated uses the
          trimmed $currentDate - confirm that the untrimmed value is acceptable for startDate. :)
240       element startDate {current-date()}
241    }
242}
(: Fallback: no citation creators, so the originating centres are used as creators. :)
243else if (exists($DIF/dif:Originating_Center)) then
244element dgDataCreator {
245element dgMetadataID {
246element schemeIdentifier {'NDG-B0'},
247element repositoryIdentifier {$input_repository},
248if ($output_local_id != 'Output_LocalID') then
249element localIdentifier  {encode-for-uri(concat('generated_creator-', $output_local_id))}
250else
251element localIdentifier  {encode-for-uri(concat('generated_creator-', data($DIF/dif:Entry_ID)))}
252},
253element roleName {'Data Creator'},
254element abbreviation {'Creator'},
255for $creatorID in $DIF/dif:Originating_Center
256return
257element dgRoleHolder {
258element dgOrganisationID {
259element schemeIdentifier {'NDG-B0'},
260element repositoryIdentifier {$input_repository},
261if ($output_local_id != 'Output_LocalID') then
262element localIdentifier  {encode-for-uri(concat('generated_orgcit-', string($creatorID), '-', $output_local_id))}
263else
264element localIdentifier  {encode-for-uri(concat('generated_orgcit-', string($creatorID), '-', data($DIF/dif:Entry_ID)))}
265},
266element startDate {current-date()}
267}
268}
269else (),
(: Curator role: the role ID and the role holder's organisation ID share the same
   'generated_curator-' identifier.
   NOTE(review): no dgOrganisation with a 'generated_curator-' identifier is constructed in this
   query - confirm that the referenced organisation is created elsewhere. :)
270element dgDataCurator {
271    element dgMetadataID {
272        element schemeIdentifier {'NDG-B0'},
273        element repositoryIdentifier {$input_repository},
274        if ($output_local_id != 'Output_LocalID') then
275            element localIdentifier  {encode-for-uri(concat('generated_curator-', $output_local_id))}
276        else
277            element localIdentifier  {encode-for-uri(concat('generated_curator-', data($DIF/dif:Entry_ID)))}
278    },
279    element roleName {'Data Curator'},
280    element abbreviation {'Curator'},
281    element dgRoleHolder {
282        element dgOrganisationID {
283            element schemeIdentifier {'NDG-B0'},
284            element repositoryIdentifier {$input_repository},
285            (:SJD this not valid - just use same localIdentifier as above..:)
286            (:element localIdentifier {$input_repository_local}:)
287            if ($output_local_id != 'Output_LocalID') then
288            element localIdentifier  {encode-for-uri(concat('generated_curator-', $output_local_id))}
289            else
290            element localIdentifier  {encode-for-uri(concat('generated_curator-', data($DIF/dif:Entry_ID)))}
291        },
292    element startDate {current-date()}
293}
294}
295}
296},
(: Structured keywords: a fixed 'd2b converted record' marker, then one keyword per dif:Keyword
   and one per dif:ISO_Topic_Category. :)
297element dgStructuredKeyword {
298element dgValidTerm {'d2b converted record'},
299element dgValidTermID {
300element ParentListID {$voclib:unknown_vocab_id},
301element TermID {'d2b'}
302}
303},
(: The four listed keywords are filed under the NDG data provider vocabulary;
   every other keyword goes to the unknown vocabulary. :)
304for $keyword in $DIF/dif:Keyword
305let $keywordText := string($keyword)
306return element dgStructuredKeyword {
307element dgValidTerm {$keywordText},
308element dgValidTermID {
309element ParentListID {
310if ($keywordText = ('MDIP', 'NERC', 'NERC_DDC', 'DPPP'))
311then concat($voclib:ndg_data_provider_vocab, '/current')
312else $voclib:unknown_vocab_id
313},
314element TermID {encode-for-uri($keywordText)}
315}
316},
(: ISO topic categories are always filed under the ISO topic list. :)
317for $isoTopic in $DIF/dif:ISO_Topic_Category
318let $isoTopicText := string($isoTopic)
319return element dgStructuredKeyword {
320element dgValidTerm {$isoTopicText},
321element dgValidTermID {
322element ParentListID {concat($voclib:iso_topic_list, '/current')},
323element TermID {encode-for-uri($isoTopicText)}
324}
325},
(: Provenance: emitted only when the DIF carries a creation date or a last-revision date. :)
326if (exists($DIF/dif:DIF_Creation_Date) or exists($DIF/dif:Last_DIF_Revision_Date)) then
327element dgMetadataProvenance {
328if (exists($DIF/dif:DIF_Creation_Date)) then
329element RecordCreation {
330element CreatedDate {
(: Use the DIF value when it is a valid xs:date, otherwise fall back to today's date. :)
331if (string($DIF/dif:DIF_Creation_Date) castable as xs:date) then
332string($DIF/dif:DIF_Creation_Date) cast as xs:date
333else (current-date())
334},
335element CreatedBy {$input_repository}
336}
337else
(: No creation date in the DIF: record the import itself as the creation event. :)
338element RecordCreation {
339element CreatedDate {current-date()},
340element CreatedBy {'MOLES Import'}
341},
342if (exists($DIF/dif:Last_DIF_Revision_Date)) then
343element RecordUpdate {
344element UpdateDate {
(: Accept a full xs:dateTime as is. A date-only value is promoted to an xs:dateTime at midnight
   instead of being discarded; only an unparseable value falls back to the current time. :)
345if (string($DIF/dif:Last_DIF_Revision_Date) castable as xs:dateTime) then
346string($DIF/dif:Last_DIF_Revision_Date) cast as xs:dateTime
else if (string($DIF/dif:Last_DIF_Revision_Date) castable as xs:date) then
(string($DIF/dif:Last_DIF_Revision_Date) cast as xs:date) cast as xs:dateTime
347else (current-dateTime())
348},
349element UpdatedBy {$input_repository}
350}
351else ()
352}
353else ()
354} (: </dgMetadataRecord>:),
355
356(: The organisation section below had to be added so that originating data centre details are picked up; the rest of this comment is disabled experimental code
357for $bum in distinct-values($DIF/(dif:Data_Set_Citation/dif:Dataset_Creator | dif:Originating_Center | dif:Data_Center))
358return
359    element snooze {data('arse')},
360
361:)
362
(: Organisation entries. One dgOrganisation is built per distinct Dataset_Creator / Originating_Center
   value; when the DIF has neither, a single dgOrganisation is built from the Data_Center names. :)
363let $creators := distinct-values($DIF/(dif:Data_Set_Citation/dif:Dataset_Creator | dif:Originating_Center ))
364return if (exists($creators)) then
365    for $creator in $creators return
366        element dgOrganisation{
367            element dgMetadataID {
368                element schemeIdentifier {'NDG-B0'},
369                element repositoryIdentifier {$input_repository},
370                element localIdentifier {encode-for-uri(concat('generated_orgcit-', string($creator), '-',
371                    if ($output_local_id != 'Output_LocalID')
372                    then $output_local_id
373                    else data($DIF/dif:Entry_ID)))}
374                },
375                element name {string($creator)},
376                element abbreviation {string($creator)},
377                element contactDetails {''}
378        }
379 else
     (: Fallback organisation: it carries a 'generated_desc-' identifier and the data centre's long and short names. :)
380     element dgOrganisation{
381         element dgMetadataID {
382                element schemeIdentifier {'NDG-B0'},
383                element repositoryIdentifier {$input_repository},
384                element localIdentifier {concat('generated_desc-',
385                    if ($output_local_id != 'Output_LocalID')
386                    then $output_local_id
387                    else encode-for-uri(string($DIF/dif:Entry_ID)))}
388               },
389                element name {data($DIF/dif:Data_Center/dif:Data_Center_Name/dif:Long_Name)},
390                element abbreviation {data($DIF/dif:Data_Center/dif:Data_Center_Name/dif:Short_Name)},
391                element contactDetails {''}
392     }
393 
394} (:    </dgMetadata> :)
395
Note: See TracBrowser for help on using the repository browser.