cleanup old doiboost workflows

2021-06-18 15:14:08 +02:00 · 2021-06-18 15:14:08 +02:00 · a3948c1f6e
parent fddbc8364e
commit a3948c1f6e
2 changed files with 0 additions and 414 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/config-default.xml
@ -1,42 +0,0 @@
-<configuration> <!-- Default job properties for the DOIBoost Oozie application; the values are cluster-specific. -->
-    <property> <!-- Referenced by the job-tracker element of the workflow's global section. -->
-        <name>jobTracker</name>
-        <value>yarnRM</value>
-    </property>
-    <property> <!-- Referenced by the workflow's global section and prepended to the Spark event-log directory. -->
-        <name>nameNode</name>
-        <value>hdfs://nameservice1</value>
-    </property>
-    <property>
-        <name>oozie.use.system.libpath</name>
-        <value>true</value>
-    </property>
-    <property> <!-- NOTE(review): the workflow's global section sets this same key from the oozieActionShareLibForSpark2 variable, which is not defined in this file; confirm where it is supplied. -->
-        <name>oozie.action.sharelib.for.spark</name>
-        <value>spark2</value>
-    </property>
-    <property>
-        <name>oozie.launcher.mapreduce.user.classpath.first</name>
-        <value>true</value>
-    </property>
-    <property> <!-- NOTE(review): not referenced by the workflow.xml removed in the same commit; the host name looks like a test cluster; confirm. -->
-        <name>hive_metastore_uris</name>
-        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
-    </property>
-    <property> <!-- Used by every Spark action as spark.yarn.historyServer.address. -->
-        <name>spark2YarnHistoryServerAddress</name>
-        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
-    </property>
-    <property> <!-- Used by every Spark action, prefixed with the nameNode value, as spark.eventLog.dir. -->
-        <name>spark2EventLogDir</name>
-        <value>/user/spark/spark2ApplicationHistory</value>
-    </property>
-    <property> <!-- The surrounding double quotes are part of the value and are substituted verbatim into spark-opts. -->
-        <name>spark2ExtraListeners</name>
-        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
-    </property>
-    <property> <!-- As above: the double quotes belong to the value itself. -->
-        <name>spark2SqlQueryExecutionListeners</name>
-        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
-    </property>
-</configuration>
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/oozie_app/workflow.xml
@ -1,372 +0,0 @@
-<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
-    <parameters> <!-- Formal parameters of the workflow; a property without a value must be supplied at submission. -->
-        <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
-        </property>
-        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorIntersectionMemory</name>
-            <description>memory for individual executor in the ProcessMAG and CreateDOIBoost steps</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
-        </property>
-        <property> <!-- Read by the global section; the default mirrors the sharelib set in config-default.xml. -->
-            <name>oozieActionShareLibForSpark2</name>
-            <value>spark2</value>
-            <description>oozie action sharelib for spark 2.*</description>
-        </property>
-
-        <!-- Intersection Parameters -->
-        <property>
-            <name>workingPath</name>
-            <description>the working Path</description>
-        </property>
-
-        <property>
-            <name>hostedByMapPath</name>
-            <description>the hostedByMap Path</description>
-        </property>
-        <property>
-            <name>outputPath</name>
-            <description>the Path of the sequence file action set</description>
-        </property>
-
-        <!-- Crossref Parameters -->
-        <property>
-            <name>inputPathCrossref</name>
-            <description>the Crossref input path</description>
-        </property>
-        <property>
-            <name>crossrefTimestamp</name>
-            <description>Timestamp for the Crossref incremental Harvesting</description>
-        </property>
-        <property>
-            <name>esServer</name>
-            <description>elasticsearch server url for the Crossref Harvesting</description>
-        </property>
-        <property>
-            <name>esIndex</name>
-            <description>elasticsearch index name for the Crossref Harvesting</description>
-        </property>
-
-        <!--    MAG Parameters    -->
-        <property>
-            <name>MAGDumpPath</name>
-            <description>the MAG dump working path</description>
-        </property>
-
-        <property>
-            <name>inputPathMAG</name>
-            <description>the MAG working path</description>
-        </property>
-
-        <!--    UnpayWall Parameters    -->
-        <property>
-            <name>inputPathUnpayWall</name>
-            <description>the UnpayWall working path</description>
-        </property>
-
-        <!--    ORCID Parameters    -->
-        <property>
-            <name>inputPathOrcid</name>
-            <description>the ORCID input path</description>
-        </property>
-
-        <property>
-            <name>workingPathOrcid</name>
-            <description>the ORCID working path</description>
-        </property>
-    </parameters>
-
-    <global> <!-- Job tracker, name node and configuration declared once for the actions of this workflow. -->
-        <job-tracker>${jobTracker}</job-tracker>
-        <name-node>${nameNode}</name-node>
-        <configuration>
-            <property> <!-- NOTE(review): the oozieActionShareLibForSpark2 variable is not set by this app's config-default.xml; confirm it is supplied at submission. -->
-                <name>oozie.action.sharelib.for.spark</name>
-                <value>${oozieActionShareLibForSpark2}</value>
-            </property>
-        </configuration>
-    </global>
-
-    <start to="resume_from"/>
-    <!-- Entry routing on the optional resumeFrom property: legacy labels (PreprocessMag, PreprocessUW, PreprocessORCID) and the real action names are both accepted; anything else starts from ImportCrossRef. -->
-    <decision name="resume_from">
-        <switch>
-            <case to="ConvertCrossrefToOAF">${wf:conf('resumeFrom') eq 'ConvertCrossrefToOAF'}</case>
-            <case to="ResetMagWorkingPath">${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}</case>
-            <case to="ProcessMAG">${wf:conf('resumeFrom') eq 'PreprocessMag' or wf:conf('resumeFrom') eq 'ProcessMAG'}</case>
-            <case to="ProcessUW">${wf:conf('resumeFrom') eq 'PreprocessUW' or wf:conf('resumeFrom') eq 'ProcessUW'}</case>
-            <case to="ProcessORCID">${wf:conf('resumeFrom') eq 'PreprocessORCID' or wf:conf('resumeFrom') eq 'ProcessORCID'}</case>
-            <case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case>
-            <case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case>
-            <default to="ImportCrossRef"/>
-        </switch>
-    </decision>
-
-    <kill name="Kill"> <!-- Failure end node: the error transition of every action in this workflow points here. -->
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-
-    <action name="ImportCrossRef"> <!-- Default entry point of the workflow: runs the CrossrefImporter Java class, then continues with GenerateCrossrefDataset. -->
-        <java>
-            <main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
-            <arg>--targetPath</arg><arg>${inputPathCrossref}/index_update</arg> <!-- index_update sub-directory of the Crossref input path -->
-            <arg>--namenode</arg><arg>${nameNode}</arg>
-            <arg>--esServer</arg><arg>${esServer}</arg> <!-- declared in the parameters section as the elasticsearch server url for the Crossref harvesting -->
-            <arg>--esIndex</arg><arg>${esIndex}</arg>
-            <arg>--timestamp</arg><arg>${crossrefTimestamp}</arg> <!-- declared in the parameters section as the timestamp for incremental harvesting -->
-        </java>
-        <ok to="GenerateCrossrefDataset"/>
-        <error to="Kill"/>
-    </action>
-
-
-    <!-- CROSSREF SECTION -->
-
-    <action name="GenerateCrossrefDataset"> <!-- Spark job CrossrefDataset run against the Crossref input path; on success continues with RenameDataset. -->
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>GenerateCrossrefDataset</name>
-            <class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
-            <jar>dhp-doiboost-${projectVersion}.jar</jar> <!-- Memory, cores, listeners, history server and event log below all come from job properties. -->
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=3840
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--workingPath</arg><arg>${inputPathCrossref}</arg> <!-- NOTE(review): presumably produces crossref_ds_updated under this path, which RenameDataset moves next; confirm in CrossrefDataset. -->
-            <arg>--master</arg><arg>yarn-cluster</arg> <!-- The master is also handed to the application as an argument, in addition to the master element above. -->
-        </spark>
-        <ok to="RenameDataset"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="RenameDataset"> <!-- Replaces crossref_ds with crossref_ds_updated under the Crossref input path, then continues with ResetMagWorkingPath. -->
-        <fs>
-            <delete path="${inputPathCrossref}/crossref_ds"/> <!-- NOTE(review): the previous dataset is deleted before the move; nothing in this action restores it if the move fails. -->
-            <move source="${inputPathCrossref}/crossref_ds_updated"
-                  target="${inputPathCrossref}/crossref_ds"/>
-        </fs>
-        <ok to="ResetMagWorkingPath"/>
-        <error to="Kill"/>
-    </action>
-
-
-
-    <!-- MAG SECTION -->
-    <action name="ResetMagWorkingPath"> <!-- Deletes the dataset and process directories under the MAG working path before ConvertMagToDataset runs. -->
-        <fs>
-            <delete path="${inputPathMAG}/dataset"/> <!-- Target path later passed to ConvertMagToDataset. -->
-            <delete path="${inputPathMAG}/process"/> <!-- Working path later passed to ProcessMAG. -->
-        </fs>
-        <ok to="ConvertMagToDataset"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="ConvertMagToDataset"> <!-- Spark job SparkImportMagIntoDataset: MAG dump path as source, dataset directory of the MAG working path as target. -->
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Convert Mag to Dataset</name>
-            <class>eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset</class>
-            <jar>dhp-doiboost-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=3840
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--sourcePath</arg><arg>${MAGDumpPath}</arg>
-            <arg>--targetPath</arg><arg>${inputPathMAG}/dataset</arg> <!-- Same directory that ResetMagWorkingPath deletes beforehand. -->
-            <arg>--master</arg><arg>yarn-cluster</arg>
-        </spark>
-        <ok to="ConvertCrossrefToOAF"/> <!-- The flow returns to the Crossref conversion before ProcessMAG. -->
-        <error to="Kill"/>
-    </action>
-
-
-    <action name="ConvertCrossrefToOAF"> <!-- Spark job SparkMapDumpIntoOAF: crossref_ds as source, the shared working path as target; also a resumeFrom entry point. -->
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>ConvertCrossrefToOAF</name>
-            <class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
-            <jar>dhp-doiboost-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=3840
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--sourcePath</arg><arg>${inputPathCrossref}/crossref_ds</arg> <!-- The directory RenameDataset moves the updated dataset into. -->
-            <arg>--targetPath</arg><arg>${workingPath}</arg>
-            <arg>--master</arg><arg>yarn-cluster</arg>
-        </spark>
-        <ok to="ProcessMAG"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="ProcessMAG"> <!-- Spark job SparkProcessMAG; executor memory comes from sparkExecutorIntersectionMemory rather than sparkExecutorMemory. -->
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Convert Mag to OAF Dataset</name>
-            <class>eu.dnetlib.doiboost.mag.SparkProcessMAG</class>
-            <jar>dhp-doiboost-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorIntersectionMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=3840
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--sourcePath</arg><arg>${inputPathMAG}/dataset</arg> <!-- Target directory of ConvertMagToDataset. -->
-            <arg>--workingPath</arg><arg>${inputPathMAG}/process</arg> <!-- Deleted by ResetMagWorkingPath at the start of the MAG section. -->
-            <arg>--targetPath</arg><arg>${workingPath}</arg>
-            <arg>--master</arg><arg>yarn-cluster</arg>
-        </spark>
-        <ok to="ProcessUW"/>
-        <error to="Kill"/>
-    </action>
-
-    <!--  UnpayWall  SECTION -->
-
-    <action name="ProcessUW">
-        <!-- Spark job SparkMapUnpayWallToOAF: uw_extracted under the UnpayWall path as source, uwPublication under the working path as target; continues with ProcessORCID. -->
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Convert UnpayWall to Dataset</name>
-            <class>eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF</class>
-            <jar>dhp-doiboost-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=3840
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--sourcePath</arg><arg>${inputPathUnpayWall}/uw_extracted</arg>
-            <arg>--targetPath</arg><arg>${workingPath}/uwPublication</arg>
-            <arg>--master</arg><arg>yarn-cluster</arg>
-        </spark>
-        <ok to="ProcessORCID"/>
-        <error to="Kill"/>
-    </action>
-
-    <!--  ORCID  SECTION -->
-    <action name="ProcessORCID"> <!-- Spark job SparkConvertORCIDToOAF: ORCID input path as source, orcidPublication under the shared working path as target. -->
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Convert ORCID to Dataset</name>
-            <class>eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF</class>
-            <jar>dhp-doiboost-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=3840
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
-            <arg>--workingPath</arg><arg>${workingPathOrcid}</arg> <!-- ORCID has its own working path, separate from the shared workingPath. -->
-            <arg>--targetPath</arg><arg>${workingPath}/orcidPublication</arg>
-            <arg>--master</arg><arg>yarn-cluster</arg>
-        </spark>
-        <ok to="CreateDOIBoost"/>
-        <error to="Kill"/>
-    </action>
-
-    <!-- INTERSECTION SECTION-->
-    <action name="CreateDOIBoost">
-        <!-- Spark job SparkGenerateDoiBoost over the shared working path, with the hostedBy map and the MAG affiliation datasets as extra inputs; executor memory comes from sparkExecutorIntersectionMemory. -->
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create DOIBoost Infospace</name>
-            <class>eu.dnetlib.doiboost.SparkGenerateDoiBoost</class>
-            <jar>dhp-doiboost-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorIntersectionMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=3840
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
-            <arg>--affiliationPath</arg><arg>${inputPathMAG}/dataset/Affiliations</arg>
-            <arg>--paperAffiliationPath</arg><arg>${inputPathMAG}/dataset/PaperAuthorAffiliations</arg>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--master</arg><arg>yarn-cluster</arg>
-        </spark>
-        <ok to="GenerateActionSet"/>
-        <error to="Kill"/>
-    </action>
-
-
-    <action name="GenerateActionSet"> <!-- Last step: Spark job SparkGenerateDOIBoostActionSet; on success the workflow reaches End. -->
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Generate DOIBoost ActionSet</name>
-            <class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
-            <jar>dhp-doiboost-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=3840
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-            </spark-opts>
-            <arg>--dbPublicationPath</arg><arg>${workingPath}/doiBoostPublicationFiltered</arg> <!-- All five db/crossRef paths below are sub-directories of the shared working path. -->
-            <arg>--dbDatasetPath</arg><arg>${workingPath}/crossrefDataset</arg>
-            <arg>--crossRefRelation</arg><arg>${workingPath}/crossrefRelation</arg>
-            <arg>--dbaffiliationRelationPath</arg><arg>${workingPath}/doiBoostPublicationAffiliation</arg>
-            <arg>--dbOrganizationPath</arg><arg>${workingPath}/doiBoostOrganization</arg>
-            <arg>--targetPath</arg><arg>${workingPath}/actionDataSet</arg>
-            <arg>--sFilePath</arg><arg>${outputPath}</arg> <!-- outputPath is declared in the parameters section as the path of the sequence file action set. -->
-            <arg>--master</arg><arg>yarn-cluster</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
-    <end name="End"/>
-</workflow-app>