defined a single oozie workflow for the generation of doiboost

2021-01-04 17:01:35 +01:00 · 2021-01-04 17:01:35 +01:00 · b0dc92786f
parent 7185158942
commit b0dc92786f
4 changed files with 372 additions and 2 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala
@ -39,6 +39,7 @@ object SparkGenerateDOIBoostActionSet {
    val dbaffiliationRelationPath   = parser.get("dbaffiliationRelationPath")
    val dbOrganizationPath          = parser.get("dbOrganizationPath")
    val workingDirPath              = parser.get("targetPath")
    val sequenceFilePath            = parser.get("sFilePath")
    spark.read.load(dbDatasetPath).as[OafDataset]
      .map(d =>DoiBoostMappingUtil.fixResult(d))
@ -65,7 +66,7 @@ object SparkGenerateDOIBoostActionSet {
    val d: Dataset[(String, String)] =spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
-    d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
+    d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json
@ -5,5 +5,6 @@
  {"paramName": "cr",   "paramLongName":"crossRefRelation",                 "paramDescription": "the UnpayWall Publication Path",  "paramRequired": true},
  {"paramName": "da",   "paramLongName":"dbaffiliationRelationPath",        "paramDescription": "the MAG Publication Path",        "paramRequired": true},
  {"paramName": "do",   "paramLongName":"dbOrganizationPath",               "paramDescription": "the MAG Publication Path",        "paramRequired": true},
-  {"paramName": "w",    "paramLongName":"targetPath",                       "paramDescription": "the Working Path",                "paramRequired": true}
+  {"paramName": "w",    "paramLongName":"targetPath",                       "paramDescription": "the Working Path",                "paramRequired": true},
  {"paramName": "sp",    "paramLongName":"sFilePath",                       "paramDescription": "the Sequence file Path",          "paramRequired": true},
 ]
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/config-default.xml
@ -0,0 +1,42 @@
 <configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    <property>
        <name>spark2EventLogDir</name>
        <value>/user/spark/spark2ApplicationHistory</value>
    </property>
    <property>
        <name>spark2ExtraListeners</name>
        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
    </property>
    <property>
        <name>spark2SqlQueryExecutionListeners</name>
        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml
@ -0,0 +1,326 @@
 <workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorIntersectionMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <!-- Itersection Parameters -->
        <property>
            <name>workingPath</name>
            <description>the working Path</description>
        </property>
        <property>
            <name>hostedByMapPath</name>
            <description>the hostedByMap Path</description>
        </property>
        <property>
            <name>outputPath</name>
            <description>the Path of the sequence file action set</description>
        </property>
        <!-- Crossref Parameters -->
        <property>
            <name>inputPathCrossref</name>
            <description>the Crossref input path</description>
        </property>
        <property>
            <name>crossrefTimestamp</name>
            <description>Timestamp for the Crossref incremental Harvesting</description>
        </property>
        <!--    MAG Parameters    -->
        <property>
            <name>inputPathMAG</name>
            <description>the MAG working path</description>
        </property>
        <!--    UnpayWall Parameters    -->
        <property>
            <name>inputPathUnpayWall</name>
            <description>the UnpayWall working path</description>
        </property>
        <!--    ORCID Parameters    -->
        <property>
            <name>inputPathOrcid</name>
            <description>the ORCID working path</description>
        </property>
    </parameters>
    <start to="resume_from"/>
    <decision name="resume_from">
        <switch>
            <case to="ConvertCrossrefToOAF">${wf:conf('resumeFrom') eq 'ConvertCrossrefToOAF'}</case>
            <case to="ResetMagWorkingPath">${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}</case>
            <case to="PreprocessMag">${wf:conf('resumeFrom') eq 'PreprocessMag'}</case>
            <case to="PreprocessUW">${wf:conf('resumeFrom') eq 'PreprocessUW'}</case>
            <case to="PreprocessORCID">${wf:conf('resumeFrom') eq 'PreprocessORCID'}</case>
            <case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case>
            <case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case>
            <default to="ImportCrossRef"/>
        </switch>
    </decision>
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <action name="ImportCrossRef">
        <java>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
            <arg>-t</arg><arg>${inputPathCrossref}/index_update</arg>
            <arg>-n</arg><arg>${nameNode}</arg>
            <arg>-ts</arg><arg>${timestamp}</arg>
        </java>
        <ok to="GenerateCrossrefDataset"/>
        <error to="Kill"/>
    </action>
    <!-- CROSSREF SECTION -->
    <action name="GenerateCrossrefDataset">
            <spark xmlns="uri:oozie:spark-action:0.2">
                <master>yarn-cluster</master>
                <mode>cluster</mode>
                <name>GenerateCrossrefDataset</name>
                <class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
                <jar>dhp-doiboost-${projectVersion}.jar</jar>
                <spark-opts>
                    --executor-memory=${sparkExecutorMemory}
                    --executor-cores=${sparkExecutorCores}
                    --driver-memory=${sparkDriverMemory}
                    --conf spark.sql.shuffle.partitions=3840
                    ${sparkExtraOPT}
                </spark-opts>
                <arg>--workingPath</arg><arg>${inputPathCrossref}</arg>
                <arg>--master</arg><arg>yarn-cluster</arg>
            </spark>
            <ok to="RenameDataset"/>
            <error to="Kill"/>
    </action>
    <action name="RenameDataset">
        <fs>
            <delete path="${inputPathCrossref}/crossref_ds"/>
            <move source="${inputPathCrossref}/crossref_ds_updated"
                  target="${inputPathCrossref}/crossref_ds"/>
        </fs>
        <ok to="ConvertCrossrefToOAF"/>
        <error to="Kill"/>
    </action>
    <action name="ConvertCrossrefToOAF">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>ConvertCrossrefToOAF</name>
            <class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${inputPathCrossref}/crossref_ds</arg>
            <arg>--targetPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="ResetMagWorkingPath"/>
        <error to="Kill"/>
    </action>
    <!-- MAG SECTION -->
    <action name="ResetMagWorkingPath">
        <fs>
            <delete path='${inputPathMAG}/dataset'/>
            <delete path='${inputPathMAG}/process'/>
            <delete path='${inputPathMAG}/dataset'/>
        </fs>
        <ok to="ConvertMagToDataset"/>
        <error to="Kill"/>
    </action>
    <action name="ConvertMagToDataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Convert Mag to Dataset</name>
            <class>eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${inputPathMAG}/input</arg>
            <arg>--targetPath</arg><arg>${inputPathMAG}/dataset</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="PreprocessMag"/>
        <error to="Kill"/>
    </action>
    <action name="PreprocessMag">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Convert Mag to OAF Dataset</name>
            <class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${inputPathMAG}/dataset</arg>
            <arg>--workingPath</arg><arg>${inputPathMAG}/process</arg>
            <arg>--targetPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="PreprocessUW"/>
        <error to="Kill"/>
    </action>
    <!--  UnpayWall  SECTION -->
    <action name="PreprocessUW">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Convert UnpayWall to Dataset</name>
            <class>eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${inputPathUnpayWall}/uw_extracted</arg>
            <arg>--targetPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="PreprocessORCID"/>
        <error to="Kill"/>
    </action>
    <!--  ORCID  SECTION -->
    <action name="PreprocessORCID">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Convert ORCID to Dataset</name>
            <class>eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
            <arg>--targetPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="CreateDOIBoost"/>
        <error to="Kill"/>
    </action>
 <!--    INTERSECTION SECTION-->
    <action name="CreateDOIBoost">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Create DOIBoost Infospace</name>
            <class>eu.dnetlib.doiboost.SparkGenerateDoiBoost</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
            <arg>--affiliationPath</arg><arg>${inputPathMAG}/process/Affiliations</arg>
            <arg>--paperAffiliationPath</arg><arg>${inputPathMAG}/process/PaperAuthorAffiliations</arg>
            <arg>--workingDirPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="GenerateActionSet"/>
        <error to="Kill"/>
    </action>
    <action name="GenerateActionSet">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Generate DOIBoost ActionSet</name>
            <class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorIntersectionMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=3840
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--dbPublicationPath</arg><arg>${workingPath}/doiBoostPublicationFiltered</arg>
            <arg>--dbDatasetPath</arg><arg>${workingPath}/crossrefDataset</arg>
            <arg>--crossRefRelation</arg><arg>${workingPath}/crossrefRelation</arg>
            <arg>--dbaffiliationRelationPath</arg><arg>${workingPath}/doiBoostPublicationAffiliation</arg>
            <arg>-do</arg><arg>${workingPath}/doiBoostOrganization</arg>
            <arg>--targetPath</arg><arg>${workingPath}/actionDataSet</arg>
            <arg>--sFilePath</arg><arg>${outputPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>