diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
index 72a220ef1..f1195a16f 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_updates_download/oozie_app/workflow.xml
@@ -31,7 +31,7 @@
sparkExecutorMemory
- 2G
+ 4G
memory for individual executor
@@ -190,6 +190,14 @@
-odownloads/updated_works
-t${token}
+
+
+
+
+
+
+
+
@@ -217,6 +225,14 @@
-o-
-t-
+
+
+
+
+
+
+
+
@@ -270,15 +286,6 @@
${workingPath}/orcid_dataset/new_works/*
${workingPath}/orcid_dataset/works
-
-
-
-
-
-
-
-
-
@@ -309,5 +316,5 @@
-
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/PangaeaUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/PangaeaUtils.scala
index f62267e94..c57b1f7a9 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/PangaeaUtils.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/PangaeaUtils.scala
@@ -1,6 +1,5 @@
package eu.dnetlib.sx.pangaea
-
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}
import org.json4s
@@ -9,11 +8,13 @@ import org.json4s.jackson.JsonMethods.parse
import java.text.SimpleDateFormat
import java.util.Date
+import java.util.regex.Pattern
+import scala.language.postfixOps
+import scala.xml.{Elem, Node, XML}
-
-case class PangaeaDataModel(datestamp:String, identifier:String, xml:String) {}
-
-
+// Parsed record; as built by PangaeaUtils.parseXml, relation holds DOIs and linkage holds (type attribute, text) pairs.
+case class PangaeaDataModel(identifier: String, title: List[String], objectType: List[String], creator: List[String], publisher: List[String], dataCenter: List[String],
+  subject: List[String], language: String, rights: String, parent: String, relation: List[String], linkage: List[(String, String)])
object PangaeaUtils {
@@ -21,14 +22,46 @@ object PangaeaUtils {
def toDataset(input:String):PangaeaDataModel = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
-
- val d = new Date()
- val s:String = s"${new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")format(d)}Z"
-
- val ds = (json \ "internal-datestamp").extractOrElse[String](s)
- val identifier= (json \ "metadatalink").extractOrElse[String]("")
val xml= (json \ "xml").extract[String]
- PangaeaDataModel(ds, identifier,xml)
+ parseXml(xml)
+ }
+
+ // Returns the first DOI found in each relation string; strings without a DOI are dropped.
+ def findDOIInRelation( input:List[String]):List[String] = {
+   // DOI shape: "10.", a registrant code of 4+ digits, "/" and a suffix free of whitespace and of " & ' < >
+   val doiPattern = Pattern.compile("\\b(10[.][0-9]{4,}(?:[.][0-9]+)*\\/(?:(?![\"&\\'<>])\\S)+)\\b")
+   input.flatMap { relation =>
+     val matcher = doiPattern.matcher(relation)
+     // Option instead of a null placeholder: a miss simply contributes nothing to the result
+     if (matcher.find()) Some(matcher.group(0)) else None
+   }
+ }
+
+ def attributeOpt(attribute: String, node:Node): Option[String] = // text of the first value of the named attribute, if present
+   node.attribute(attribute).flatMap(_.headOption).map(_.text)
+
+ // Returns (type, text) for every <linkage> child of node that carries a "type" attribute; untyped ones are skipped.
+ def extractLinkage(node:Elem):List[(String, String)] =
+   (node \ "linkage").toList.flatMap(n => attributeOpt("type", n).map(t => (t, n.text)))
+
+ // Builds a PangaeaDataModel from a record's XML; only the DOIs found in <relation> are kept as relations.
+ def parseXml(input:String):PangaeaDataModel = {
+   val root = XML.loadString(input)
+   // one text entry per direct child element named `tag`
+   def texts(tag: String): List[String] = (root \ tag).map(_.text)(collection.breakOut)
+   // text of the direct children named `tag`, as a single string
+   def text(tag: String): String = (root \ tag).text
+   PangaeaDataModel(
+     identifier = text("identifier"),
+     title = texts("title"),
+     objectType = texts("type"),
+     creator = texts("creator"), publisher = texts("publisher"),
+     dataCenter = texts("dataCenter"),
+     subject = texts("subject"),
+     language = text("language"), rights = text("rights"),
+     parent = text("parentIdentifier"),
+     relation = findDOIInRelation(texts("relation")),
+     linkage = extractLinkage(root))
+ }
@@ -44,11 +77,9 @@ object PangaeaUtils {
if (a == null)
b
else {
- val ts1 = b.datestamp
- val ts2 = a._2.datestamp
- if (ts1 > ts2)
+ if (b.title != null && b.title.nonEmpty)
b
- else
+ else
a._2
}
@@ -62,9 +93,7 @@ object PangaeaUtils {
if (b2 == null)
b1
else {
- val ts1 = b1.datestamp
- val ts2 = b2.datestamp
- if (ts1 > ts2)
+ if (b1.title != null && b1.title.nonEmpty)
b1
else
b2
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/SparkGeneratePanagaeaDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/SparkGeneratePanagaeaDataset.scala
index 17b286a7e..88e5f2142 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/SparkGeneratePanagaeaDataset.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/sx/pangaea/SparkGeneratePanagaeaDataset.scala
@@ -42,7 +42,7 @@ object SparkGeneratePanagaeaDataset {
.groupByKey(_._1)(Encoders.STRING)
.agg(PangaeaUtils.getDatasetAggregator().toColumn)
.map(s => s._2)
- .write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset_updated")
+ .write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset")
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala
index 55eb4ee98..053e4d63e 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/pangaea/PangaeaTransformTest.scala
@@ -1,9 +1,12 @@
package eu.dnetlib.dhp.sx.pangaea
+import eu.dnetlib.sx.pangaea.PangaeaUtils
import org.junit.jupiter.api.Test
+
import java.util.TimeZone
import java.text.SimpleDateFormat
import java.util.Date
+import scala.io.Source
class PangaeaTransformTest {
@@ -15,11 +18,12 @@ class PangaeaTransformTest {
val d = new Date()
- val s:String = s"${new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")format(d)}Z"
-
-
+ val s:String = s"${new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS")format d}Z"
println(s)
+
+ val xml = Source.fromInputStream(getClass.getResourceAsStream("input.xml")).mkString
+ println(PangaeaUtils.parseXml(xml))
}
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/pangaea/input.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/pangaea/input.xml
new file mode 100644
index 000000000..8818f316f
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/pangaea/input.xml
@@ -0,0 +1,91 @@
+
+ The inorganic carbon system measured and calculated on Cibicidoides cf. wuellerstorfi of sediment core PS75/100-1
+ Kersten, Franziska
+ Tiedemann, Ralf
+ Fietzke, Jan
+ Frische, Matthias
+ Tiedemann, Ralf
+ PANGAEA
+ PANGAEA: Data Publisher for Earth & Environmental Science
+ 2013-10-21
+ Dataset
+ text/tab-separated-values, 8 data points
+ https://doi.org/10.1594/PANGAEA.820647
+ https://doi.org/10.1594/PANGAEA.821013
+ en
+ CC-BY-3.0: Creative Commons Attribution 3.0 Unported
+ Kersten, Franziska (2013): Last Glacial to Holocene changes of deep and intermediate water carbonate ion concentrations in the Southern Ocean: constraints from foraminiferal Boron/Calcium ratios. PhD Thesis, Alfred Wegener Institute, Helmholtz Centre for Polar and Marine Research, Bremerhaven
+ Kersten, Franziska; Tiedemann, Ralf; Fietzke, Jan; Frische, Matthias (2013): The B/Ca proxy for past seawater carbonate chemistry reconstructions-laser ablation based calibrations for C. mundulus, C. wuellerstorfi and its morphotype C. cf. wuellerstorfi. Climate of the Past Discussions, 9(4), 4425-4448, https://doi.org/10.5194/cpd-9-4425-2013
+ Kersten, Franziska (2013): CO2sys Input variables estimated from nearby GLODAP sites. hdl:10013/epic.42543.d001
+ DEPTH, sediment/rock
+ Cibicidoides cf. wuellerstorfi, Boron/Calcium ratio
+ Cibicidoides cf. wuellerstorfi, Boron/Calcium standard deviation
+ Δ carbonate ion content
+ Carbonate ion
+ LA-ICP-MS, Laser-ablation inductively coupled plasma mass spectrometer
+ Calculated
+ AWI_Paleo: Paleoenvironmental Reconstructions from Marine Sediments @ AWI
+ Polarstern
+ MultiCorer
+ MUC
+ PS75/100-1
+ ANT-XXVI/2
+ PS75 BIPOMAC
+ citable
+ deNBIchemical
+ author20400
+ author32978
+ author49036
+ author49445
+ basis1
+ campaign33969
+ event2584362
+ geocode1
+ geocode1599
+ geocode1600
+ geocode1601
+ geocode8128
+ inst32
+ journal16751
+ license101
+ method10668
+ method4872
+ method50
+ param131203
+ param131204
+ param7034
+ param82364
+ pi20400
+ project1
+ ref60902
+ ref60959
+ ref61047
+ term1045260
+ term1073131
+ term19836
+ term21005
+ term2663825
+ term33871
+ term37764
+ term38263
+ term38520
+ term41056
+ term43863
+ term44030
+ topotype3
+
+ -45.75757
+ 177.14887
+ -45.75757
+ 177.14887
+ South Pacific Ocean
+ 0.0 m (DEPTH, sediment/rock)
+ 0.01 m (DEPTH, sediment/rock)
+ 2010-01-22
+ 2010-01-22
+
+ https://doi.pangaea.de/10.1594/PANGAEA.820647
+ https://doi.pangaea.de/10.1594/PANGAEA.820647?format=textfile
+ 7x(14-22)
+ Reconstruction equation: B/Ca = 2.27(D[CO32-]) + 152.5 (R2= 0.76)
+
\ No newline at end of file