diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
index 096217a552..579ce8d429 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
@@ -14,7 +14,7 @@ import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.matching.Regex
-case class CrossrefDT(doi: String, json:String) {}
+case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
case class mappingAffiliation(name: String) {}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala
index fac4c90b47..08319058c7 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala
@@ -29,69 +29,90 @@ object SparkMapDumpIntoOAF {
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
+ implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderRelatons: Encoder[Relation] = Encoders.kryo[Relation]
implicit val mapEncoderDatasets: Encoder[oaf.Dataset] = Encoders.kryo[OafDataset]
val sc = spark.sparkContext
val targetPath = parser.get("targetPath")
+ import spark.implicits._
- sc.sequenceFile(parser.get("sourcePath"), classOf[IntWritable], classOf[Text])
- .map(k => k._2.toString).map(CrossrefImporter.decompressBlob)
- .flatMap(k => Crossref2Oaf.convert(k)).saveAsObjectFile(s"${targetPath}/mixObject")
-
- val inputRDD = sc.objectFile[Oaf](s"${targetPath}/mixObject").filter(p=> p!= null)
-
- val distinctPubs:RDD[Publication] = inputRDD.filter(k => k != null && k.isInstanceOf[Publication])
- .map(k => k.asInstanceOf[Publication]).map { p: Publication => Tuple2(p.getId, p) }.reduceByKey { case (p1: Publication, p2: Publication) =>
- var r = if (p1 == null) p2 else p1
- if (p1 != null && p2 != null) {
- if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) {
- if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp)
- r = p2
- else
- r = p1
- } else {
- r = if (p1.getLastupdatetimestamp == null) p2 else p1
- }
- }
- r
- }.map(_._2)
-
- val pubs:Dataset[Publication] = spark.createDataset(distinctPubs)
- pubs.write.mode(SaveMode.Overwrite).save(s"${targetPath}/publication")
+ spark.read.load(parser.get("sourcePath")).as[CrossrefDT]
+ .flatMap(k => Crossref2Oaf.convert(k.json))
+ .filter(o => o != null)
+ .write.mode(SaveMode.Overwrite).save(s"$targetPath/mixObject")
- val distincDatasets:RDD[OafDataset] = inputRDD.filter(k => k != null && k.isInstanceOf[OafDataset])
- .map(k => k.asInstanceOf[OafDataset]).map(p => Tuple2(p.getId, p)).reduceByKey { case (p1: OafDataset, p2: OafDataset) =>
- var r = if (p1 == null) p2 else p1
- if (p1 != null && p2 != null) {
- if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) {
- if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp)
- r = p2
- else
- r = p1
- } else {
- r = if (p1.getLastupdatetimestamp == null) p2 else p1
- }
- }
- r
- }.map(_._2)
+ val ds:Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]
- spark.createDataset(distincDatasets).write.mode(SaveMode.Overwrite).save(s"${targetPath}/dataset")
+    ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/publication")
+
+    ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/relation")
+
+    ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/dataset")
- val distinctRels =inputRDD.filter(k => k != null && k.isInstanceOf[Relation])
- .map(k => k.asInstanceOf[Relation]).map(r=> (s"${r.getSource}::${r.getTarget}",r))
- .reduceByKey { case (p1: Relation, p2: Relation) =>
- if (p1 == null) p2 else p1
- }.map(_._2)
-
- val rels: Dataset[Relation] = spark.createDataset(distinctRels)
-
- rels.write.mode(SaveMode.Overwrite).save(s"${targetPath}/relations")
}
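
Note (not part of the patch): the hunk above replaces the old RDD/objectFile round-trip with a Dataset-based flow. The sketch below restates that flow as a minimal, self-contained Scala program for readers outside the patch context. It is an illustration only: the eu.dnetlib.dhp.schema.oaf import path is assumed, the paths and local master are placeholders, and SaveMode.Overwrite is added on the split writes so re-runs are idempotent.

package eu.dnetlib.doiboost.crossref

import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, Oaf, Publication, Relation}
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}

// Minimal sketch of the new convert-and-split flow (not the module's actual job).
object CrossrefConvertSketch {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("CrossrefConvertSketch")
      .master("local[*]") // placeholder; the real job runs with --master yarn-cluster
      .getOrCreate()

    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val pubEncoder: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val datEncoder: Encoder[OafDataset] = Encoders.kryo[OafDataset]
    import spark.implicits._

    val sourcePath = "/tmp/crossref/crossref_ds" // hypothetical parquet store of CrossrefDT rows
    val targetPath = "/tmp/crossref/oaf"         // hypothetical output root

    // 1) Read CrossrefDT records, convert each JSON payload into Oaf entities,
    //    drop nulls and persist the mixed result once.
    spark.read.load(sourcePath).as[CrossrefDT]
      .flatMap(dt => Crossref2Oaf.convert(dt.json))
      .filter(o => o != null)
      .write.mode(SaveMode.Overwrite).save(s"$targetPath/mixObject")

    // 2) Re-read the mixed dataset and split it by concrete Oaf subtype.
    val ds: Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]
    ds.filter(_.isInstanceOf[Publication]).map(_.asInstanceOf[Publication])
      .write.mode(SaveMode.Overwrite).save(s"$targetPath/publication")
    ds.filter(_.isInstanceOf[Relation]).map(_.asInstanceOf[Relation])
      .write.mode(SaveMode.Overwrite).save(s"$targetPath/relation")
    ds.filter(_.isInstanceOf[OafDataset]).map(_.asInstanceOf[OafDataset])
      .write.mode(SaveMode.Overwrite).save(s"$targetPath/dataset")
  }
}

Persisting the mixed dataset once and filtering it three times trades a little extra I/O for a single pass over the expensive Crossref2Oaf.convert step.
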
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/oozie_app/workflow.xml
index be4a45afe5..a9cc9ea3cd 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/crossref/oozie_app/workflow.xml
@@ -16,10 +16,10 @@
             <name>sparkExecutorCores</name>
             <description>number of cores used by single executor</description>
-        <property>
-            <name>timestamp</name>
-            <description>Timestamp for incremental Harvesting</description>
-        </property>
+
+
+
+
@@ -30,29 +30,29 @@
            <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
-            <arg>-t</arg><arg>${workingPath}/input/crossref/index_dump_1</arg>
-            <arg>-n</arg><arg>${nameNode}</arg>
-            <arg>-ts</arg><arg>${timestamp}</arg>
-        </java>
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
@@ -68,7 +68,7 @@
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
-            <arg>--sourcePath</arg><arg>${workingPath}/input/crossref/index_dump,${workingPath}/input/crossref/index_dump_1,${workingPath}/crossref/index_dump</arg>
+            <arg>--sourcePath</arg><arg>${workingPath}/input/crossref/crossref_ds</arg>
             <arg>--targetPath</arg><arg>${workingPath}/input/crossref</arg>
             <arg>--master</arg><arg>yarn-cluster</arg>
@@ -78,26 +78,26 @@
-
-
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>ExtractCrossrefToOAF</name>
-            <class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
-            <jar>dhp-doiboost-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                ${sparkExtraOPT}
-            </spark-opts>
-            <arg>--sourcePath</arg><arg>/data/doiboost/crossref/cr_dataset</arg>
-            <arg>--targetPath</arg><arg>/data/doiboost/crossref/crossrefDataset</arg>
-            <arg>--master</arg><arg>yarn-cluster</arg>
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
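
Note (not part of the patch): with the timestamp parameter and the ImportCrossRef / ExtractCrossrefToOAF actions removed, the converter now expects its input under ${workingPath}/input/crossref/crossref_ds as a parquet store of CrossrefDT(doi, json, timestamp) rows. The sketch below only illustrates how such a store could be built from the old sequence-file dump; it is not the CrossrefDataset job referenced above. Assumptions: CrossrefImporter.decompressBlob(String) behaves as in the code removed in the first hunk, json4s is on the classpath, the paths are hypothetical, and Crossref's "DOI" and "indexed.timestamp" fields are acceptable sources for the doi and timestamp columns.

package eu.dnetlib.doiboost.crossref

import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.json4s._
import org.json4s.jackson.JsonMethods.parse

// Hypothetical helper, not part of this patch: one way to materialise the
// crossref_ds parquet store from the old compressed sequence-file dump.
object CrossrefDsSketch {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("CrossrefDsSketch")
      .master("local[*]") // placeholder; the real workflow runs on yarn-cluster
      .getOrCreate()
    import spark.implicits._

    val dumpPath = "/tmp/crossref/index_dump"  // hypothetical path of the compressed dump
    val dsPath   = "/tmp/crossref/crossref_ds" // mirrors the new --sourcePath layout

    spark.sparkContext
      .sequenceFile(dumpPath, classOf[IntWritable], classOf[Text])
      .map(kv => CrossrefImporter.decompressBlob(kv._2.toString))
      .map { json =>
        implicit val formats: Formats = DefaultFormats
        val record = parse(json)
        // Assumption: DOI comes from the "DOI" field and the new timestamp column
        // can be filled from Crossref's "indexed.timestamp" value.
        val doi = (record \ "DOI").extractOrElse[String]("")
        val ts  = (record \ "indexed" \ "timestamp").extractOrElse[Long](0L)
        CrossrefDT(doi, json, ts)
      }
      .toDS()
      .write.mode(SaveMode.Overwrite).save(dsPath)
  }
}
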