diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
index 7a6cd3faa..da9da22b6 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
@@ -39,6 +39,7 @@ object SparkGenerateDOIBoostActionSet {
val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
val dbOrganizationPath = parser.get("dbOrganizationPath")
val workingDirPath = parser.get("targetPath")
+ val sequenceFilePath = parser.get("sFilePath")
spark.read.load(dbDatasetPath).as[OafDataset]
.map(d =>DoiBoostMappingUtil.fixResult(d))
@@ -65,7 +66,7 @@ object SparkGenerateDOIBoostActionSet {
val d: Dataset[(String, String)] =spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
- d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
+ d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
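
The rawset is written as a gzip-compressed Hadoop SequenceFile of Text key/value pairs, now at the configurable sFilePath instead of the hard-coded $workingDirPath/rawset, and the repartition(6000) splits the write into 6,000 part files. A minimal read-back sketch for spot-checking the output, assuming a live SparkContext `sc` and that `sequenceFilePath` points at the directory written above:

```scala
import org.apache.hadoop.io.Text

// Load the action-set SequenceFile; the key/value strings mirror the
// (String, String) pairs of the actionSet dataset saved above.
val rawset = sc.sequenceFile(sequenceFilePath, classOf[Text], classOf[Text])
  .map { case (k, v) => (k.toString, v.toString) } // copy out of reused Writables

rawset.take(5).foreach { case (k, v) => println(s"$k -> ${v.take(120)}") }
```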
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json
index 6eb1ec6f1..0cf9955c9 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json
@@ -5,5 +5,6 @@
{"paramName": "cr", "paramLongName":"crossRefRelation", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
{"paramName": "da", "paramLongName":"dbaffiliationRelationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
{"paramName": "do", "paramLongName":"dbOrganizationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
- {"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true}
+ {"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true},
+ {"paramName": "sp", "paramLongName":"sFilePath", "paramDescription": "the Sequence file Path", "paramRequired": true},
]
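
For context, this spec is consumed through eu.dnetlib.dhp.application.ArgumentApplicationParser, which is how the new entry surfaces as parser.get("sFilePath") in the Scala change above. A rough sketch of that existing pattern (not code added by this diff):

```scala
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import scala.io.Source

// Read the JSON spec from the classpath and parse the CLI arguments;
// --sFilePath (short form -sp) is accepted thanks to the entry added above.
val spec = Source.fromInputStream(
  getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")
).mkString
val parser = new ArgumentApplicationParser(spec)
parser.parseArgument(args) // args must also carry the other required params
val sequenceFilePath = parser.get("sFilePath")
```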
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/config-default.xml
new file mode 100644
index 000000000..508202e30
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/config-default.xml
@@ -0,0 +1,42 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>hive_metastore_uris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>spark2YarnHistoryServerAddress</name>
+        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
+    </property>
+    <property>
+        <name>spark2EventLogDir</name>
+        <value>/user/spark/spark2ApplicationHistory</value>
+    </property>
+    <property>
+        <name>spark2ExtraListeners</name>
+        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
+    </property>
+    <property>
+        <name>spark2SqlQueryExecutionListeners</name>
+        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
+    </property>
+</configuration>
\ No newline at end of file
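
These are the usual IIS/OCEAN cluster defaults; Oozie merges them with the submitted job.properties, which take precedence. The spark2* values only reach the Spark actions if forwarded (typically via ${sparkExtraOPT} in the workflow's spark-opts). An illustrative Scala equivalent of what those settings amount to inside a job — names are taken from the file above, the actual wiring is assumed rather than shown by this diff:

```scala
import org.apache.spark.sql.SparkSession

// Illustrative only: the session config these defaults translate to
// once passed down as --conf flags.
val spark = SparkSession.builder()
  .appName("doiboost-example")
  .config("spark.eventLog.dir", "/user/spark/spark2ApplicationHistory")
  .config("spark.extraListeners", "com.cloudera.spark.lineage.NavigatorAppListener")
  .config("spark.sql.queryExecutionListeners", "com.cloudera.spark.lineage.NavigatorQueryListener")
  .config("hive.metastore.uris", "thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083")
  .getOrCreate()
```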
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml
new file mode 100644
index 000000000..d77972512
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml
@@ -0,0 +1,326 @@
+<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorIntersectionMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+
+        <property>
+            <name>workingPath</name>
+            <description>the working Path</description>
+        </property>
+        <property>
+            <name>hostedByMapPath</name>
+            <description>the hostedByMap Path</description>
+        </property>
+        <property>
+            <name>outputPath</name>
+            <description>the Path of the sequence file action set</description>
+        </property>
+
+        <!-- Crossref parameters -->
+        <property>
+            <name>inputPathCrossref</name>
+            <description>the Crossref input path</description>
+        </property>
+        <property>
+            <name>crossrefTimestamp</name>
+            <description>Timestamp for the Crossref incremental Harvesting</description>
+        </property>
+
+        <!-- MAG parameters -->
+        <property>
+            <name>inputPathMAG</name>
+            <description>the MAG working path</description>
+        </property>
+
+        <!-- UnpayWall parameters -->
+        <property>
+            <name>inputPathUnpayWall</name>
+            <description>the UnpayWall working path</description>
+        </property>
+
+        <!-- ORCID parameters -->
+        <property>
+            <name>inputPathOrcid</name>
+            <description>the ORCID working path</description>
+        </property>
+    </parameters>
+
+    <start to="resume_from"/>
+
+    <decision name="resume_from">
+        <switch>
+            <case to="ConvertCrossrefToOAF">${wf:conf('resumeFrom') eq 'ConvertCrossrefToOAF'}</case>
+            <case to="ResetMagWorkingPath">${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}</case>
+            <case to="PreprocessMag">${wf:conf('resumeFrom') eq 'PreprocessMag'}</case>
+            <case to="PreprocessUW">${wf:conf('resumeFrom') eq 'PreprocessUW'}</case>
+            <case to="PreprocessORCID">${wf:conf('resumeFrom') eq 'PreprocessORCID'}</case>
+            <case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case>
+            <case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case>
+            <default to="ImportCrossRef"/>
+        </switch>
+    </decision>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="ImportCrossRef">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
+            <arg>-t</arg><arg>${inputPathCrossref}/index_update</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-ts</arg><arg>${timestamp}</arg>
+        </java>
+        <ok to="GenerateCrossrefDataset"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="GenerateCrossrefDataset">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>GenerateCrossrefDataset</name>
+            <class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>--workingPath</arg><arg>${inputPathCrossref}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="ConvertCrossrefToOAF"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ConvertCrossrefToOAF">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>ConvertCrossrefToOAF</name>
+            <class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${inputPathCrossref}/crossref_ds</arg>
+            <arg>--targetPath</arg><arg>${workingPath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="ResetMagWorkingPath"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ResetMagWorkingPath">
+        <fs>
+            <delete path="${inputPathMAG}/dataset"/>
+            <delete path="${inputPathMAG}/process"/>
+        </fs>
+        <ok to="ConvertMagToDataset"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="ConvertMagToDataset">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Convert Mag to Dataset</name>
+            <class>eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${inputPathMAG}/input</arg>
+            <arg>--targetPath</arg><arg>${inputPathMAG}/dataset</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="PreprocessMag"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="PreprocessMag">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Convert Mag to OAF Dataset</name>
+            <class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${inputPathMAG}/dataset</arg>
+            <arg>--workingPath</arg><arg>${inputPathMAG}/process</arg>
+            <arg>--targetPath</arg><arg>${workingPath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="PreprocessUW"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="PreprocessUW">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Convert UnpayWall to Dataset</name>
+            <class>eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${inputPathUnpayWall}/uw_extracted</arg>
+            <arg>--targetPath</arg><arg>${workingPath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="PreprocessORCID"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="PreprocessORCID">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Convert ORCID to Dataset</name>
+            <class>eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
+            <arg>--targetPath</arg><arg>${workingPath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="CreateDOIBoost"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="CreateDOIBoost">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Create DOIBoost Infospace</name>
+            <class>eu.dnetlib.doiboost.SparkGenerateDoiBoost</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
+            <arg>--affiliationPath</arg><arg>${inputPathMAG}/process/Affiliations</arg>
+            <arg>--paperAffiliationPath</arg><arg>${inputPathMAG}/process/PaperAuthorAffiliations</arg>
+            <arg>--workingDirPath</arg><arg>${workingPath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="GenerateActionSet"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="GenerateActionSet">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Generate DOIBoost ActionSet</name>
+            <class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorIntersectionMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>--dbPublicationPath</arg><arg>${workingPath}/doiBoostPublicationFiltered</arg>
+            <arg>--dbDatasetPath</arg><arg>${workingPath}/crossrefDataset</arg>
+            <arg>--crossRefRelation</arg><arg>${workingPath}/crossrefRelation</arg>
+            <arg>--dbaffiliationRelationPath</arg><arg>${workingPath}/doiBoostPublicationAffiliation</arg>
+            <arg>-do</arg><arg>${workingPath}/doiBoostOrganization</arg>
+            <arg>--targetPath</arg><arg>${workingPath}/actionDataSet</arg>
+            <arg>--sFilePath</arg><arg>${outputPath}</arg>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
\ No newline at end of file