changed the workflow to extract info from the dump

Miriam Baglioni 2021-06-15 09:22:54 +02:00
parent d6e21bb6ea
commit f7379255b6
3 changed files with 143 additions and 120 deletions

Deleted file: UnpackCrossrefDumpEntries.scala (package eu.dnetlib.doiboost.crossref)

@@ -1,54 +0,0 @@
package eu.dnetlib.doiboost.crossref

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST._
import org.json4s.jackson.JsonMethods._

import scala.io.Source

object UnpackCrossrefDumpEntries {

  val log: Logger = LoggerFactory.getLogger(UnpackCrossrefDumpEntries.getClass)

  def extractDump(input: String): List[String] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input)

    val a = (json \ "items").extract[JArray]
    a.arr.map(s => compact(render(s)))
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf
    val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString)
    parser.parseArgument(args)
    val master = parser.get("master")
    val sourcePath = parser.get("sourcePath")
    val targetPath = parser.get("targetPath")

    val spark: SparkSession = SparkSession.builder().config(conf)
      .appName(UnpackCrossrefDumpEntries.getClass.getSimpleName)
      .master(master)
      .getOrCreate()

    val sc: SparkContext = spark.sparkContext

    sc.wholeTextFiles(sourcePath, 6000).flatMap(d => extractDump(d._2))
      .saveAsTextFile(targetPath, classOf[GzipCodec])
  }
}
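
For reference, the JSON flattening this removed class performed is small enough to reproduce standalone. The sketch below repeats the same json4s logic on an invented two-record payload; the `ExtractDumpDemo` name and the sample DOIs are illustrative only, not part of the codebase:

// Standalone sketch of UnpackCrossrefDumpEntries.extractDump: each Crossref
// dump file is a JSON object whose "items" array holds the records; the job
// emitted one compact JSON string per record.
import org.json4s._
import org.json4s.jackson.JsonMethods._

object ExtractDumpDemo {
  implicit val formats: DefaultFormats.type = DefaultFormats

  def extractDump(input: String): List[String] = {
    val json = parse(input)
    (json \ "items").extract[JArray].arr.map(s => compact(render(s)))
  }

  def main(args: Array[String]): Unit = {
    // invented sample payload for illustration
    val sample = """{"items":[{"DOI":"10.1/a"},{"DOI":"10.1/b"}]}"""
    extractDump(sample).foreach(println)
    // prints {"DOI":"10.1/a"} then {"DOI":"10.1/b"}
  }
}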

Changed file: workflow.xml ("read Crossref dump from HDFS", crossref_dump_reader)

@@ -1,9 +1,13 @@
 <workflow-app name="read Crossref dump from HDFS" xmlns="uri:oozie:workflow:0.5">
     <parameters>
-<!--        <property>-->
-<!--            <name>workingPath</name>-->
-<!--            <description>the working dir base path</description>-->
-<!--        </property>-->
+        <property>
+            <name>crossrefDumpPath</name>
+            <description>the working dir base path</description>
+        </property>
+        <property>
+            <name>inputPathCrossref</name>
+            <description>the working dir base path</description>
+        </property>
         <property>
             <name>sparkDriverMemory</name>
             <description>memory for driver process</description>
@@ -14,25 +18,27 @@
         </property>
         <property>
             <name>sparkExecutorCores</name>
-            <value>2</value>
             <description>number of cores used by single executor</description>
         </property>
     </parameters>

-    <start to="generateCrossrefDataset"/>
+    <start to="ImportCrossRef"/>

     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>

-    <action name="ReadCrossRefDump">
+    <action name="ImportCrossRef">
         <java>
             <job-tracker>${jobTracker}</job-tracker>
             <name-node>${nameNode}</name-node>
             <main-class>eu.dnetlib.doiboost.crossref.ExtractCrossrefRecords</main-class>
             <arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
-            <arg>--workingPath</arg><arg>/data/doiboost/crossref/</arg>
-            <arg>--crossrefFileNameTarGz</arg><arg>crossref.tar.gz</arg>
+            <arg>--crossrefFileNameTarGz</arg><arg>${crossrefDumpPath}/crossref.tar.gz</arg>
+            <arg>--workingPath</arg><arg>${crossrefDumpPath}</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/files/</arg>
         </java>
         <ok to="generateCrossrefDataset"/>
         <error to="Kill"/>
@@ -42,24 +48,42 @@
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn-cluster</master>
             <mode>cluster</mode>
-            <name>SparkCreateCrossredDataset</name>
-            <class>eu.dnetlib.doiboost.crossref.GenerateCrossrefDatasetSpark</class>
+            <name>SparkGenerateCrossrefDataset</name>
+            <class>eu.dnetlib.doiboost.crossref.GenerateCrossrefDataset</class>
             <jar>dhp-doiboost-${projectVersion}.jar</jar>
             <spark-opts>
-                --conf spark.dynamicAllocation.enabled=true
-                --conf spark.dynamicAllocation.maxExecutors=20
-                --executor-memory=6G
-                --driver-memory=7G
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
                 --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             </spark-opts>
             <arg>--master</arg><arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>/data/doiboost/crossref/filess</arg>
-            <arg>--targetPath</arg><arg>/tmp/miriam/crossref/crossrefDataset</arg>
+            <arg>--sourcePath</arg><arg>${workingDir}/files</arg>
+            <arg>--targetPath</arg><arg>${inputPathCrossref}/crossref_ds_updated</arg>
         </spark>
+        <ok to="removeFiles"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="removeFiles">
+        <fs>
+            <delete path="${workingDir}/files"/>
+        </fs>
+        <ok to="renameDataset"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="renameDataset">
+        <fs>
+            <delete path="${inputPathCrossref}/crossref_ds"/>
+            <move source="${inputPathCrossref}/crossref_ds_updated"
+                  target="${inputPathCrossref}/crossref_ds"/>
+        </fs>
         <ok to="End"/>
         <error to="Kill"/>
     </action>
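
The new `ImportCrossRef` action delegates the unpacking of `crossref.tar.gz` to `eu.dnetlib.doiboost.crossref.ExtractCrossrefRecords`, whose source is not part of this diff. Purely as a hypothetical sketch of what a job taking these arguments (`--hdfsServerUri`, `--crossrefFileNameTarGz`, `--workingPath`, `--outputPath`) typically does, assuming Hadoop's FileSystem API and Apache Commons Compress on the classpath:

// Illustrative sketch only -- ExtractCrossrefRecords itself is not shown here.
import java.net.URI
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.IOUtils

object TarGzUnpackSketch {
  def unpack(hdfsServerUri: String, tarGz: String, outputPath: String): Unit = {
    val fs = FileSystem.get(URI.create(hdfsServerUri), new Configuration())
    // stream the .tar.gz straight from HDFS
    val tar = new TarArchiveInputStream(
      new GzipCompressorInputStream(fs.open(new Path(tarGz))))
    var entry = tar.getNextTarEntry
    while (entry != null) {
      if (!entry.isDirectory) {
        // one HDFS output file per archive entry, e.g. under ${workingDir}/files
        val out = fs.create(new Path(s"$outputPath/${entry.getName}"))
        IOUtils.copyBytes(tar, out, 4096, false) // keep the tar stream open
        out.close()
      }
      entry = tar.getNextTarEntry
    }
    tar.close()
  }
}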

Changed file: workflow.xml (DOIBoost workflow, Crossref and MAG sections)

@@ -41,17 +41,21 @@
             <description>the Crossref input path</description>
         </property>
         <property>
-            <name>crossrefTimestamp</name>
-            <description>Timestamp for the Crossref incremental Harvesting</description>
-        </property>
-        <property>
-            <name>esServer</name>
-            <description>elasticsearch server url for the Crossref Harvesting</description>
-        </property>
-        <property>
-            <name>esIndex</name>
-            <description>elasticsearch index name for the Crossref Harvesting</description>
+            <name>crossrefDumpPath</name>
+            <description>the Crossref dump path</description>
         </property>
+<!--        <property>-->
+<!--            <name>crossrefTimestamp</name>-->
+<!--            <description>Timestamp for the Crossref incremental Harvesting</description>-->
+<!--        </property>-->
+<!--        <property>-->
+<!--            <name>esServer</name>-->
+<!--            <description>elasticsearch server url for the Crossref Harvesting</description>-->
+<!--        </property>-->
+<!--        <property>-->
+<!--            <name>esIndex</name>-->
+<!--            <description>elasticsearch index name for the Crossref Harvesting</description>-->
+<!--        </property>-->

         <!-- MAG Parameters -->
         <property>
@@ -114,55 +118,104 @@
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>

+<!--    <action name="ImportCrossRef">-->
+<!--        <java>-->
+<!--            <main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>-->
+<!--            <arg>&#45;&#45;targetPath</arg><arg>${inputPathCrossref}/index_update</arg>-->
+<!--            <arg>&#45;&#45;namenode</arg><arg>${nameNode}</arg>-->
+<!--            <arg>&#45;&#45;esServer</arg><arg>${esServer}</arg>-->
+<!--            <arg>&#45;&#45;esIndex</arg><arg>${esIndex}</arg>-->
+<!--            <arg>&#45;&#45;timestamp</arg><arg>${crossrefTimestamp}</arg>-->
+<!--        </java>-->
+<!--        <ok to="GenerateCrossrefDataset"/>-->
+<!--        <error to="Kill"/>-->
+<!--    </action>-->

     <action name="ImportCrossRef">
         <java>
-            <main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
-            <arg>--targetPath</arg><arg>${inputPathCrossref}/index_update</arg>
-            <arg>--namenode</arg><arg>${nameNode}</arg>
-            <arg>--esServer</arg><arg>${esServer}</arg>
-            <arg>--esIndex</arg><arg>${esIndex}</arg>
-            <arg>--timestamp</arg><arg>${crossrefTimestamp}</arg>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.crossref.ExtractCrossrefRecords</main-class>
+            <arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
+            <arg>--crossrefFileNameTarGz</arg><arg>${crossrefDumpPath}/crossref.tar.gz</arg>
+            <arg>--workingPath</arg><arg>${crossrefDumpPath}</arg>
+            <arg>--outputPath</arg><arg>${workingDir}/files</arg>
         </java>
-        <ok to="GenerateCrossrefDataset"/>
+        <ok to="generateCrossrefDataset"/>
         <error to="Kill"/>
     </action>

+    <action name="generateCrossrefDataset">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>SparkGenerateCrossrefDataset</name>
+            <class>eu.dnetlib.doiboost.crossref.GenerateCrossrefDataset</class>
+            <jar>dhp-doiboost-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=3840
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--master</arg><arg>yarn-cluster</arg>
+            <arg>--sourcePath</arg><arg>${workingDir}/files/</arg>
+            <arg>--targetPath</arg><arg>${inputPathCrossref}/crossref_ds_updated</arg>
+        </spark>
+        <ok to="removeFiles"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="removeFiles">
+        <fs>
+            <delete path="${workingDir}/files"/>
+        </fs>
+        <ok to="ResetMagWorkingPath"/>
+        <error to="Kill"/>
+    </action>
+
     <!-- CROSSREF SECTION -->
-    <action name="GenerateCrossrefDataset">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>GenerateCrossrefDataset</name>
-            <class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
-            <jar>dhp-doiboost-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=3840
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--workingPath</arg><arg>${inputPathCrossref}</arg>
-            <arg>--master</arg><arg>yarn-cluster</arg>
-        </spark>
-        <ok to="RenameDataset"/>
-        <error to="Kill"/>
-    </action>
-    <action name="RenameDataset">
-        <fs>
-            <delete path="${inputPathCrossref}/crossref_ds"/>
-            <move source="${inputPathCrossref}/crossref_ds_updated"
-                  target="${inputPathCrossref}/crossref_ds"/>
-        </fs>
-        <ok to="ResetMagWorkingPath"/>
-        <error to="Kill"/>
-    </action>
+<!--    <action name="GenerateCrossrefDataset">-->
+<!--        <spark xmlns="uri:oozie:spark-action:0.2">-->
+<!--            <master>yarn-cluster</master>-->
+<!--            <mode>cluster</mode>-->
+<!--            <name>GenerateCrossrefDataset</name>-->
+<!--            <class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>-->
+<!--            <jar>dhp-doiboost-${projectVersion}.jar</jar>-->
+<!--            <spark-opts>-->
+<!--                &#45;&#45;executor-memory=${sparkExecutorMemory}-->
+<!--                &#45;&#45;executor-cores=${sparkExecutorCores}-->
+<!--                &#45;&#45;driver-memory=${sparkDriverMemory}-->
+<!--                &#45;&#45;conf spark.sql.shuffle.partitions=3840-->
+<!--                &#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
+<!--                &#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
+<!--                &#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
+<!--                &#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
+<!--            </spark-opts>-->
+<!--            <arg>&#45;&#45;workingPath</arg><arg>${inputPathCrossref}</arg>-->
+<!--            <arg>&#45;&#45;master</arg><arg>yarn-cluster</arg>-->
+<!--        </spark>-->
+<!--        <ok to="RenameDataset"/>-->
+<!--        <error to="Kill"/>-->
+<!--    </action>-->
+<!--    <action name="RenameDataset">-->
+<!--        <fs>-->
+<!--            <delete path="${inputPathCrossref}/crossref_ds"/>-->
+<!--            <move source="${inputPathCrossref}/crossref_ds_updated"-->
+<!--                  target="${inputPathCrossref}/crossref_ds"/>-->
+<!--        </fs>-->
+<!--        <ok to="ResetMagWorkingPath"/>-->
+<!--        <error to="Kill"/>-->
+<!--    </action>-->
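
The `renameDataset` step in the dump-reader workflow (and the now commented-out `RenameDataset` here) implements a delete-and-swap update: the freshly generated `crossref_ds_updated` replaces `crossref_ds` only after the Spark job has succeeded. A minimal sketch of the same swap via the Hadoop FileSystem API, illustrative rather than code from this commit:

// Same effect as the <fs> action: drop the old dataset, promote the new one.
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object RenameDatasetSketch {
  def swap(fs: FileSystem, inputPathCrossref: String): Unit = {
    val current = new Path(s"$inputPathCrossref/crossref_ds")
    val updated = new Path(s"$inputPathCrossref/crossref_ds_updated")
    fs.delete(current, true)    // <delete path=".../crossref_ds"/>
    fs.rename(updated, current) // <move source=".../crossref_ds_updated" target=".../crossref_ds"/>
  }
}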