Updated the Pangaea transformation to parse the XML directly

This commit is contained in:
Sandro La Bruzzo 2021-04-28 10:21:03 +02:00
parent 5afa7d3e0c
commit 2129e9caa7
4 changed files with 147 additions and 23 deletions

View File

@ -1,6 +1,5 @@
package eu.dnetlib.sx.pangaea
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}
import org.json4s
@ -9,11 +8,13 @@ import org.json4s.jackson.JsonMethods.parse
import java.text.SimpleDateFormat
import java.util.Date
import java.util.regex.Pattern
import scala.language.postfixOps
import scala.xml.{Elem, Node, XML}
// NOTE(review): two declarations named PangaeaDataModel follow. The three-field one looks like
// the pre-change version carried over from the diff -- confirm, and keep only one of them.
case class PangaeaDataModel(datestamp:String, identifier:String, xml:String) {}
/**
 * Flattened view of one Pangaea dataset record, as assembled by `PangaeaUtils.parseXml`.
 *
 * @param identifier text of the record's `identifier` element(s)
 * @param title      text of each `title` element
 * @param objectType text of each `type` element
 * @param creator    text of each `creator` element
 * @param publisher  text of each `publisher` element
 * @param dataCenter text of each `dataCenter` element
 * @param subject    text of each `subject` element
 * @param language   text of the `language` element(s)
 * @param rights     text of the `rights` element(s)
 * @param parent     text of the `parentIdentifier` element(s)
 * @param relation   DOIs extracted from the `relation` elements; relations without a DOI are dropped
 * @param linkage    `(type attribute, element text)` pairs, one per `linkage` element that has a `type`
 */
case class PangaeaDataModel(identifier:String, title:List[String], objectType:List[String], creator:List[String],
publisher:List[String], dataCenter :List[String],subject :List[String], language:String,
rights:String, parent:String,relation :List[String],linkage:List[(String,String)] ) {}
object PangaeaUtils {
@ -21,14 +22,46 @@ object PangaeaUtils {
/**
 * Converts one harvested record, serialized as JSON, into a [[PangaeaDataModel]].
 *
 * Only the `xml` field of the JSON document is read; its content is handed to
 * `parseXml`, which builds the model.
 *
 * @param input JSON document carrying the record XML in its `xml` field
 * @return the model parsed from the embedded XML
 */
def toDataset(input:String):PangaeaDataModel = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  lazy val json: json4s.JValue = parse(input)
  // No fallback is supplied for `xml`: a record without that field presumably fails
  // with a json4s extraction error rather than yielding an empty model -- TODO confirm.
  val xml = (json \ "xml").extract[String]
  parseXml(xml)
}
/**
 * Extracts the first DOI found in each relation string.
 *
 * Strings in which no DOI is found contribute nothing to the result, so the
 * returned list may be shorter than the input. `null` entries are skipped.
 *
 * @param input free-text relation strings
 * @return the first DOI of every string that contains one, in input order
 */
def findDOIInRelation( input:List[String]):List[String] = {
  val pattern = Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")
  input.flatMap { relation =>
    // Option(...) turns a null entry into None instead of failing inside the matcher.
    Option(relation).flatMap { text =>
      val matcher = pattern.matcher(text)
      if (matcher.find()) Some(matcher.group(0)) else None
    }
  }
}
/**
 * Reads the text of a named attribute of an XML node.
 *
 * @param attribute name of the attribute to look up
 * @param node      node whose attributes are inspected
 * @return the text of the first node of the attribute's value, or `None` when the
 *         attribute is absent or its value sequence is empty
 */
def attributeOpt(attribute: String, node:Node): Option[String] =
  for {
    values <- node.attribute(attribute)
    first <- values.headOption
  } yield first.text
/**
 * Collects the typed `linkage` children of an element.
 *
 * @param node element whose direct `linkage` children are read
 * @return one `(type, text)` pair per `linkage` child that has a `type` attribute,
 *         in document order; children without that attribute are left out
 */
def extractLinkage(node:Elem):List[(String, String)] = {
  (node \ "linkage").toList.flatMap { link =>
    // Pair the type with the element text only when the attribute exists,
    // so no Option is ever unwrapped with `.get`.
    attributeOpt("type", link).map(linkType => (linkType, link.text))
  }
}
/**
 * Builds a [[PangaeaDataModel]] from the XML of a single record.
 *
 * Every field is read from the direct children of the document's root element.
 * Relations are reduced to the DOIs found in them by `findDOIInRelation`, and
 * linkages are gathered by `extractLinkage`.
 *
 * @param input XML document of one record, as a string
 * @return the model populated from that document
 */
def parseXml(input:String):PangaeaDataModel = {
  val root = XML.loadString(input)

  // Text of each direct child of the root carrying the given label, in document order.
  // NOTE(review): labels are passed without the `dc:` prefix seen in the sample input;
  // this relies on `\` matching on the local name -- confirm.
  def childTexts(label: String): List[String] = (root \ label).map(_.text).toList

  // Text of all direct children carrying the given label, joined into one string.
  def childText(label: String): String = (root \ label).text

  PangaeaDataModel(
    identifier = childText("identifier"),
    title = childTexts("title"),
    objectType = childTexts("type"),
    creator = childTexts("creator"),
    publisher = childTexts("publisher"),
    dataCenter = childTexts("dataCenter"),
    subject = childTexts("subject"),
    language = childText("language"),
    rights = childText("rights"),
    parent = childText("parentIdentifier"),
    // Only the DOI part of each relation is kept.
    relation = findDOIInRelation(childTexts("relation")),
    linkage = extractLinkage(root)
  )
}
@ -44,11 +77,9 @@ object PangaeaUtils {
if (a == null)
b
else {
val ts1 = b.datestamp
val ts2 = a._2.datestamp
if (ts1 > ts2)
if (b.title != null && b.title.nonEmpty)
b
else
else
a._2
}
@ -62,9 +93,7 @@ object PangaeaUtils {
if (b2 == null)
b1
else {
val ts1 = b1.datestamp
val ts2 = b2.datestamp
if (ts1 > ts2)
if (b1.title != null && b1.title.nonEmpty)
b1
else
b2

View File

@ -42,7 +42,7 @@ object SparkGeneratePanagaeaDataset {
.groupByKey(_._1)(Encoders.STRING)
.agg(PangaeaUtils.getDatasetAggregator().toColumn)
.map(s => s._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset_updated")
.write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset")
}

View File

@ -1,9 +1,12 @@
package eu.dnetlib.dhp.sx.pangaea
import eu.dnetlib.sx.pangaea.PangaeaUtils
import org.junit.jupiter.api.Test
import java.util.TimeZone
import java.text.SimpleDateFormat
import java.util.Date
import scala.io.Source
class PangaeaTransformTest {
@ -15,11 +18,12 @@ class PangaeaTransformTest {
val d = new Date()
val s:String = s"${new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")format(d)}Z"
val s:String = s"${new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")format d}Z"
println(s)
val xml = Source.fromInputStream(getClass.getResourceAsStream("input.xml")).mkString
println(PangaeaUtils.parseXml(xml))
}
}

View File

@ -0,0 +1,91 @@
<dataset xmlns="urn:pangaea.de:dataportals" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<dc:title>The inorganic carbon system measured and calculated on Cibicidoides cf. wuellerstorfi of sediment core PS75/100-1</dc:title>
<dc:creator>Kersten, Franziska</dc:creator>
<dc:creator>Tiedemann, Ralf</dc:creator>
<dc:creator>Fietzke, Jan</dc:creator>
<dc:creator>Frische, Matthias</dc:creator>
<principalInvestigator>Tiedemann, Ralf</principalInvestigator>
<dc:publisher>PANGAEA</dc:publisher>
<dataCenter>PANGAEA: Data Publisher for Earth &amp; Environmental Science</dataCenter>
<dc:date>2013-10-21</dc:date>
<dc:type>Dataset</dc:type>
<dc:format>text/tab-separated-values, 8 data points</dc:format>
<dc:identifier>https://doi.org/10.1594/PANGAEA.820647</dc:identifier>
<parentIdentifier>https://doi.org/10.1594/PANGAEA.821013</parentIdentifier>
<dc:language>en</dc:language>
<dc:rights>CC-BY-3.0: Creative Commons Attribution 3.0 Unported</dc:rights>
<dc:relation>Kersten, Franziska (2013): Last Glacial to Holocene changes of deep and intermediate water carbonate ion concentrations in the Southern Ocean: constraints from foraminiferal Boron/Calcium ratios. PhD Thesis, Alfred Wegener Institute, Helmholtz Centre for Polar and Marine Research, Bremerhaven</dc:relation>
<dc:relation>Kersten, Franziska; Tiedemann, Ralf; Fietzke, Jan; Frische, Matthias (2013): The B/Ca proxy for past seawater carbonate chemistry reconstructions-laser ablation based calibrations for C. mundulus, C. wuellerstorfi and its morphotype C. cf. wuellerstorfi. Climate of the Past Discussions, 9(4), 4425-4448, https://doi.org/10.5194/cpd-9-4425-2013</dc:relation>
<dc:relation>Kersten, Franziska (2013): CO2sys Input variables estimated from nearby GLODAP sites. hdl:10013/epic.42543.d001</dc:relation>
<dc:subject type="parameter" xsi:type="SubjectType">DEPTH, sediment/rock</dc:subject>
<dc:subject type="parameter" xsi:type="SubjectType">Cibicidoides cf. wuellerstorfi, Boron/Calcium ratio</dc:subject>
<dc:subject type="parameter" xsi:type="SubjectType">Cibicidoides cf. wuellerstorfi, Boron/Calcium standard deviation</dc:subject>
<dc:subject type="parameter" xsi:type="SubjectType">Δ carbonate ion content</dc:subject>
<dc:subject type="parameter" xsi:type="SubjectType">Carbonate ion</dc:subject>
<dc:subject type="method" xsi:type="SubjectType">LA-ICP-MS, Laser-ablation inductively coupled plasma mass spectrometer</dc:subject>
<dc:subject type="method" xsi:type="SubjectType">Calculated</dc:subject>
<dc:subject type="project" xsi:type="SubjectType">AWI_Paleo: Paleoenvironmental Reconstructions from Marine Sediments @ AWI</dc:subject>
<dc:subject type="platform" xsi:type="SubjectType">Polarstern</dc:subject>
<dc:subject type="sensor" xsi:type="SubjectType">MultiCorer</dc:subject>
<dc:subject type="sensor" xsi:type="SubjectType">MUC</dc:subject>
<dc:subject type="feature" xsi:type="SubjectType">PS75/100-1</dc:subject>
<dc:subject type="feature" xsi:type="SubjectType">ANT-XXVI/2</dc:subject>
<dc:subject type="feature" xsi:type="SubjectType">PS75 BIPOMAC</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">citable</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">deNBIchemical</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">author20400</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">author32978</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">author49036</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">author49445</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">basis1</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">campaign33969</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">event2584362</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">geocode1</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">geocode1599</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">geocode1600</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">geocode1601</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">geocode8128</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">inst32</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">journal16751</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">license101</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">method10668</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">method4872</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">method50</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">param131203</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">param131204</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">param7034</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">param82364</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">pi20400</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">project1</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">ref60902</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">ref60959</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">ref61047</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">term1045260</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">term1073131</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">term19836</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">term21005</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">term2663825</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">term33871</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">term37764</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">term38263</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">term38520</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">term41056</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">term43863</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">term44030</dc:subject>
<dc:subject type="pangaea-tech-keyword" xsi:type="SubjectType">topotype3</dc:subject>
<dc:coverage xsi:type="CoverageType">
<northBoundLatitude>-45.75757</northBoundLatitude>
<westBoundLongitude>177.14887</westBoundLongitude>
<southBoundLatitude>-45.75757</southBoundLatitude>
<eastBoundLongitude>177.14887</eastBoundLongitude>
<location>South Pacific Ocean</location>
<minElevation>0.0 m (DEPTH, sediment/rock)</minElevation>
<maxElevation>0.01 m (DEPTH, sediment/rock)</maxElevation>
<startDate>2010-01-22</startDate>
<endDate>2010-01-22</endDate>
</dc:coverage>
<linkage type="metadata">https://doi.pangaea.de/10.1594/PANGAEA.820647</linkage>
<linkage type="data">https://doi.pangaea.de/10.1594/PANGAEA.820647?format=textfile</linkage>
<additionalContent>7x(14-22)</additionalContent>
<additionalContent>Reconstruction equation: B/Ca = 2.27(D[CO32-]) + 152.5 (R2= 0.76)</additionalContent>
</dataset>