Refactor: move the EBI code from dhp-graph-mapper into dhp-aggregation

2021-10-14 14:23:05 +02:00 · 2021-10-14 14:23:05 +02:00 · 51a03c0a50
parent dd568ec88b
commit 51a03c0a50
7 changed files with 109 additions and 238 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala
@ -66,7 +66,7 @@ object SparkDownloadEBILinks {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val MAX_ITEM_PER_PARTITION = 20000
    val conf: SparkConf = new SparkConf()
-    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json")))
+    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
@ -86,7 +86,7 @@ object SparkDownloadEBILinks {
    val workingPath = parser.get("workingPath")
    log.info(s"workingPath  -> $workingPath")

-    log.info("Getting max pubmedId where the links have been requested")
+    log.info("Getting max pubmedId where the links have already requested")
    val links: Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
    val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)

--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala
@ -14,7 +14,7 @@ object SparkEBILinksToOaf {
  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
-    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_to_df_params.json")))
+    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
@ -31,7 +31,7 @@ object SparkEBILinksToOaf {
    log.info(s"targetPath  -> $targetPath")
    implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])

-    val ebLinks: Dataset[EBILinkItem] = spark.read.load(s"${sourcePath}_dataset").as[EBILinkItem].filter(l => l.links != null)
+    val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{"))

    ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
      .filter(p => BioDBToOAF.EBITargetLinksFilter(p))
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml
@ -0,0 +1,105 @@
+<workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>sourcePath</name>
+            <description>the path of the folder containing the ebi_links_dataset</description>
+        </property>
+        <property>
+            <name>workingPath</name>
+            <description>the Working Path</description>
+        </property>
+        <property>
+            <name>targetPath</name>
+            <description>the OAF MDStore Path</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>resumeFrom</name>
+            <value>DownloadEBILinks</value>
+            <description>node to start</description>
+        </property>
+    </parameters>
+
+    <start to="resume_from"/>
+
+    <decision name="resume_from">
+        <switch>
+            <case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
+            <case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
+            <default to="DownloadEBILinks"/>
+        </switch>
+    </decision>
+
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+
+    <action name="DownloadEBILinks">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Incremental Download EBI Links</name>
+            <class>eu.dnetllib.dhp.sx.bio.ebi.SparkDownloadEBILinks</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.shuffle.partitions=2000
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--master</arg><arg>yarn</arg>
+        </spark>
+        <ok to="OverrideFolders"/>
+        <error to="Kill"/>
+    </action>
+    <action name="OverrideFolders">
+        <fs>
+            <delete path="${sourcePath}/ebi_links_dataset_old"/>
+            <move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
+            <move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
+        </fs>
+        <ok to="CreateEBIDataSet"/>
+        <error to="Kill"/>
+    </action>
+    <action name="CreateEBIDataSet">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Create OAF DataSet</name>
+            <class>eu.dnetllib.dhp.sx.bio.ebi.SparkEBILinksToOaf</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.sql.shuffle.partitions=2000
+                ${sparkExtraOPT}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
+            <arg>--targetPath</arg><arg>${targetPath}</arg>
+            <arg>--master</arg><arg>yarn</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/workflow.xml
@ -1,99 +0,0 @@
-<workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
-    <parameters>
-        <property>
-            <name>workingPath</name>
-            <description>the Working Path</description>
-        </property>
-        <property>
-            <name>sparkDriverMemory</name>
-            <description>memory for driver process</description>
-        </property>
-        <property>
-            <name>sparkExecutorMemory</name>
-            <description>memory for individual executor</description>
-        </property>
-        <property>
-            <name>sparkExecutorCores</name>
-            <description>number of cores used by single executor</description>
-        </property>
-    </parameters>
-
-    <start to="CreateEBIDataSet"/>
-
-
-    <kill name="Kill">
-        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-    </kill>
-
-    <action name="GenerateBaselineDataset">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create Baselnie DataSet</name>
-
-            <class>eu.dnetlib.dhp.sx.ebi.SparkCreateBaselineDataFrame</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=1
-                --driver-memory=${sparkDriverMemory}
-                --executor-cores=${sparkExecutorCores}
-                ${sparkExtraOPT}
-            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--master</arg><arg>yarn</arg>
-            <arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="CreateEBIDataSet">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create EBI DataSet</name>
-
-            <class>eu.dnetlib.dhp.sx.ebi.SparkEBILinksToOaf</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=${sparkExecutorCores}
-                --driver-memory=${sparkDriverMemory}
-                --conf spark.sql.shuffle.partitions=1000
-                ${sparkExtraOPT}
-            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--master</arg><arg>yarn</arg>
-        </spark>
-        <ok to="GenerateUpdates"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="GenerateUpdates">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>Create Baseline DataSet</name>
-
-            <class>eu.dnetlib.dhp.sx.ebi.SparkAddLinkUpdates</class>
-            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-            <spark-opts>
-                --executor-memory=${sparkExecutorMemory}
-                --executor-cores=1
-                --driver-memory=${sparkDriverMemory}
-                --executor-cores=${sparkExecutorCores}
-                ${sparkExtraOPT}
-            </spark-opts>
-            <arg>--workingPath</arg><arg>${workingPath}</arg>
-            <arg>--master</arg><arg>yarn</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
-
-
-
-    <end name="End"/>
-</workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/config-default.xml
@ -1,68 +0,0 @@
-<configuration>
-
-    <!-- OCEAN  -->
-
-    <property>
-        <name>jobTracker</name>
-        <value>yarnRM</value>
-    </property>
-    <property>
-        <name>nameNode</name>
-        <value>hdfs://nameservice1</value>
-    </property>
-    <property>
-        <name>hive_metastore_uris</name>
-        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
-    </property>
-    <property>
-        <name>spark2YarnHistoryServerAddress</name>
-        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
-    </property>
-
-
-    <!-- GARR  -->
-
-<!--    <property>-->
-<!--        <name>jobTracker</name>-->
-<!--        <value>yarn</value>-->
-<!--    </property>-->
-<!--    <property>-->
-<!--        <name>nameNode</name>-->
-<!--        <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
-<!--    </property>-->
-<!--    <property>-->
-<!--        <name>hive_metastore_uris</name>-->
-<!--        <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
-<!--    </property>-->
-<!--    <property>-->
-<!--        <name>spark2YarnHistoryServerAddress</name>-->
-<!--        <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
-<!--    </property>-->
-
-
-    <property>
-        <name>oozie.launcher.mapreduce.user.classpath.first</name>
-        <value>true</value>
-    </property>
-
-    <property>
-        <name>oozie.use.system.libpath</name>
-        <value>true</value>
-    </property>
-    <property>
-        <name>oozie.action.sharelib.for.spark</name>
-        <value>spark2</value>
-    </property>
-    <property>
-        <name>spark2EventLogDir</name>
-        <value>/user/spark/spark2ApplicationHistory</value>
-    </property>
-    <property>
-        <name>spark2ExtraListeners</name>
-        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
-    </property>
-    <property>
-        <name>spark2SqlQueryExecutionListeners</name>
-        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
-    </property>
-</configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/workflow.xml
@ -1,67 +0,0 @@
-    <workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
-        <parameters>
-            <property>
-                <name>sourcePath</name>
-                <description>the Working Path</description>
-            </property>
-            <property>
-                <name>workingPath</name>
-                <description>the Working Path</description>
-            </property>
-            <property>
-                <name>sparkDriverMemory</name>
-                <description>memory for driver process</description>
-            </property>
-            <property>
-                <name>sparkExecutorMemory</name>
-                <description>memory for individual executor</description>
-            </property>
-            <property>
-                <name>sparkExecutorCores</name>
-                <description>number of cores used by single executor</description>
-            </property>
-        </parameters>
-
-        <start to="DownloadEBILinks"/>
-
-
-        <kill name="Kill">
-            <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
-        </kill>
-
-
-        <action name="DownloadEBILinks">
-            <spark xmlns="uri:oozie:spark-action:0.2">
-                <master>yarn-cluster</master>
-                <mode>cluster</mode>
-                <name>Incremental Download EBI Links</name>
-                <class>eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks</class>
-                <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
-                <spark-opts>
-                    --executor-memory=${sparkExecutorMemory}
-                    --executor-cores=${sparkExecutorCores}
-                    --driver-memory=${sparkDriverMemory}
-                    --conf spark.extraListeners=${spark2ExtraListeners}
-                    --conf spark.sql.shuffle.partitions=2000
-                    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
-                    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
-                    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-                </spark-opts>
-                <arg>--sourcePath</arg><arg>${sourcePath}</arg>
-                <arg>--workingPath</arg><arg>${workingPath}</arg>
-                <arg>--master</arg><arg>yarn</arg>
-            </spark>
-            <ok to="OverrideFolders"/>
-            <error to="Kill"/>
-        </action>
-        <action name="OverrideFolders">
-            <fs>
-                <delete path="${sourcePath}/ebi_links_dataset_old"/>
-                <move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
-                <move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
-            </fs>
-            <ok to="End"/>
-            <error to="Kill"/>
-        </action>
-        <end name="End"/>
-    </workflow-app>