From 2129e9caa7a6f22a0b7b65f446008089e88d6b78 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 28 Apr 2021 10:21:03 +0200 Subject: [PATCH] updated pangaea transformation to parse directly the xml --- .../eu/dnetlib/sx/pangaea/PangaeaUtils.scala | 67 ++++++++++---- .../SparkGeneratePanagaeaDataset.scala | 2 +- .../dhp/sx/pangaea/PangaeaTransformTest.scala | 10 +- .../eu/dnetlib/dhp/sx/pangaea/input.xml | 91 +++++++++++++++++++ 4 files changed, 147 insertions(+), 23 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/pangaea/input.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/PangaeaUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/PangaeaUtils.scala index f62267e94..c57b1f7a9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/PangaeaUtils.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/PangaeaUtils.scala @@ -1,6 +1,5 @@ package eu.dnetlib.sx.pangaea - import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.{Encoder, Encoders} import org.json4s @@ -9,11 +8,13 @@ import org.json4s.jackson.JsonMethods.parse import java.text.SimpleDateFormat import java.util.Date +import java.util.regex.Pattern +import scala.language.postfixOps +import scala.xml.{Elem, Node, XML} - -case class PangaeaDataModel(datestamp:String, identifier:String, xml:String) {} - - +case class PangaeaDataModel(identifier:String, title:List[String], objectType:List[String], creator:List[String], + publisher:List[String], dataCenter :List[String],subject :List[String], language:String, + rights:String, parent:String,relation :List[String],linkage:List[(String,String)] ) {} object PangaeaUtils { @@ -21,14 +22,46 @@ object PangaeaUtils { def toDataset(input:String):PangaeaDataModel = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: json4s.JValue = parse(input) - - val d = new Date() - val s:String = s"${new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")format(d)}Z" - - val ds = (json \ "internal-datestamp").extractOrElse[String](s) - val identifier= (json \ "metadatalink").extractOrElse[String]("") val xml= (json \ "xml").extract[String] - PangaeaDataModel(ds, identifier,xml) + parseXml(xml) + } + + def findDOIInRelation( input:List[String]):List[String] = { + val pattern = Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b") + input.map(i => { + val matcher = pattern.matcher(i) + if (matcher.find()) + matcher.group(0) + else + null + }).filter(i => i!= null) + } + + def attributeOpt(attribute: String, node:Node): Option[String] = + node.attribute(attribute) flatMap (_.headOption) map (_.text) + + def extractLinkage(node:Elem):List[(String, String)] = { + (node \ "linkage").map(n =>(attributeOpt("type",n), n.text)).filter(t => t._1.isDefined).map(t=> (t._1.get, t._2))(collection.breakOut) + } + + def parseXml(input:String):PangaeaDataModel = { + val xml = XML.loadString(input) + + val identifier = (xml \ "identifier").text + val title :List[String] = (xml \ "title").map(n => n.text)(collection.breakOut) + val pType :List[String] = (xml \ "type").map(n => n.text)(collection.breakOut) + val creators:List[String] = (xml \ "creator").map(n => n.text)(collection.breakOut) + val publisher :List[String] = (xml \ "publisher").map(n => n.text)(collection.breakOut) + val dataCenter :List[String] = (xml \ "dataCenter").map(n => n.text)(collection.breakOut) + val subject :List[String] = (xml \ "subject").map(n => n.text)(collection.breakOut) + val language= (xml \ "language").text + val rights= (xml \ "rights").text + val parentIdentifier= (xml \ "parentIdentifier").text + val relation :List[String] = (xml \ "relation").map(n => n.text)(collection.breakOut) + val relationFiltered = findDOIInRelation(relation) + val linkage:List[(String,String)] = extractLinkage(xml) + + PangaeaDataModel(identifier,title, pType, creators,publisher, dataCenter, subject, language, rights, parentIdentifier, relationFiltered, linkage) } @@ -44,11 +77,9 @@ object PangaeaUtils { if (a == null) b else { - val ts1 = b.datestamp - val ts2 = a._2.datestamp - if (ts1 > ts2) + if (b.title != null && b.title.nonEmpty) b - else + else a._2 } @@ -62,9 +93,7 @@ object PangaeaUtils { if (b2 == null) b1 else { - val ts1 = b1.datestamp - val ts2 = b2.datestamp - if (ts1 > ts2) + if (b1.title != null && b1.title.nonEmpty) b1 else b2 diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/SparkGeneratePanagaeaDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/SparkGeneratePanagaeaDataset.scala index 17b286a7e..88e5f2142 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/SparkGeneratePanagaeaDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/SparkGeneratePanagaeaDataset.scala @@ -42,7 +42,7 @@ object SparkGeneratePanagaeaDataset { .groupByKey(_._1)(Encoders.STRING) .agg(PangaeaUtils.getDatasetAggregator().toColumn) .map(s => s._2) - .write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset_updated") + .write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset") } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala index 55eb4ee98..053e4d63e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala @@ -1,9 +1,12 @@ package eu.dnetlib.dhp.sx.pangaea +import eu.dnetlib.sx.pangaea.PangaeaUtils import org.junit.jupiter.api.Test + import java.util.TimeZone import java.text.SimpleDateFormat import java.util.Date +import scala.io.Source class PangaeaTransformTest { @@ -15,11 +18,12 @@ class PangaeaTransformTest { val d = new Date() - val s:String = s"${new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")format(d)}Z" - - + val s:String = s"${new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")format d}Z" println(s) + + val xml = Source.fromInputStream(getClass.getResourceAsStream("input.xml")).mkString + println(PangaeaUtils.parseXml(xml)) } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/pangaea/input.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/pangaea/input.xml new file mode 100644 index 000000000..8818f316f --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/pangaea/input.xml @@ -0,0 +1,91 @@ + + The inorganic carbon system measured and calculated on Cibicidoides cf. wuellerstorfi of sediment core PS75/100-1 + Kersten, Franziska + Tiedemann, Ralf + Fietzke, Jan + Frische, Matthias + Tiedemann, Ralf + PANGAEA + PANGAEA: Data Publisher for Earth & Environmental Science + 2013-10-21 + Dataset + text/tab-separated-values, 8 data points + https://doi.org/10.1594/PANGAEA.820647 + https://doi.org/10.1594/PANGAEA.821013 + en + CC-BY-3.0: Creative Commons Attribution 3.0 Unported + Kersten, Franziska (2013): Last Glacial to Holocene changes of deep and intermediate water carbonate ion concentrations in the Southern Ocean: constraints from foraminiferal Boron/Calcium ratios. PhD Thesis, Alfred Wegener Institute, Helmholtz Centre for Polar and Marine Research, Bremerhaven + Kersten, Franziska; Tiedemann, Ralf; Fietzke, Jan; Frische, Matthias (2013): The B/Ca proxy for past seawater carbonate chemistry reconstructions-laser ablation based calibrations for C. mundulus, C. wuellerstorfi and its morphotype C. cf. wuellerstorfi. Climate of the Past Discussions, 9(4), 4425-4448, https://doi.org/10.5194/cpd-9-4425-2013 + Kersten, Franziska (2013): CO2sys Input variables estimated from nearby GLODAP sites. hdl:10013/epic.42543.d001 + DEPTH, sediment/rock + Cibicidoides cf. wuellerstorfi, Boron/Calcium ratio + Cibicidoides cf. wuellerstorfi, Boron/Calcium standard deviation + Δ carbonate ion content + Carbonate ion + LA-ICP-MS, Laser-ablation inductively coupled plasma mass spectrometer + Calculated + AWI_Paleo: Paleoenvironmental Reconstructions from Marine Sediments @ AWI + Polarstern + MultiCorer + MUC + PS75/100-1 + ANT-XXVI/2 + PS75 BIPOMAC + citable + deNBIchemical + author20400 + author32978 + author49036 + author49445 + basis1 + campaign33969 + event2584362 + geocode1 + geocode1599 + geocode1600 + geocode1601 + geocode8128 + inst32 + journal16751 + license101 + method10668 + method4872 + method50 + param131203 + param131204 + param7034 + param82364 + pi20400 + project1 + ref60902 + ref60959 + ref61047 + term1045260 + term1073131 + term19836 + term21005 + term2663825 + term33871 + term37764 + term38263 + term38520 + term41056 + term43863 + term44030 + topotype3 + + -45.75757 + 177.14887 + -45.75757 + 177.14887 + South Pacific Ocean + 0.0 m (DEPTH, sediment/rock) + 0.01 m (DEPTH, sediment/rock) + 2010-01-22 + 2010-01-22 + + https://doi.pangaea.de/10.1594/PANGAEA.820647 + https://doi.pangaea.de/10.1594/PANGAEA.820647?format=textfile + 7x(14-22) + Reconstruction equation: B/Ca = 2.27(D[CO32-]) + 152.5 (R2= 0.76) + \ No newline at end of file