refactor code for EBI from dhp-graph-mapper into dhp-aggregation

2021-10-14 14:23:05 +02:00 · 2021-10-14 14:23:05 +02:00 · 51a03c0a50
parent dd568ec88b
commit 51a03c0a50
7 changed files with 109 additions and 238 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala
@ -66,7 +66,7 @@ object SparkDownloadEBILinks {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val MAX_ITEM_PER_PARTITION = 20000
    val conf: SparkConf = new SparkConf()
-    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json")))
+    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
@ -86,7 +86,7 @@ object SparkDownloadEBILinks {
    val workingPath = parser.get("workingPath")
    log.info(s"workingPath  -> $workingPath")
-    log.info("Getting max pubmedId where the links have been requested")
+    log.info("Getting max pubmedId where the links have already requested")
    val links: Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
    val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala
@ -14,7 +14,7 @@ object SparkEBILinksToOaf {
  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
-    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_to_df_params.json")))
+    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
@ -31,7 +31,7 @@ object SparkEBILinksToOaf {
    log.info(s"targetPath  -> $targetPath")
    implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
-    val ebLinks: Dataset[EBILinkItem] = spark.read.load(s"${sourcePath}_dataset").as[EBILinkItem].filter(l => l.links != null)
+    val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{"))
    ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
      .filter(p => BioDBToOAF.EBITargetLinksFilter(p))
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml
@ -0,0 +1,105 @@
 <workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
    <!-- Workflow inputs. Only resumeFrom declares a default value here. -->
    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the base path containing the ebi_links_dataset folder</description>
        </property>
        <property>
            <name>workingPath</name>
            <description>the Working Path</description>
        </property>
        <property>
            <name>targetPath</name>
            <description>the OAF MDStore Path</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>resumeFrom</name>
            <value>DownloadEBILinks</value>
            <description>node to start from: DownloadEBILinks or CreateEBIDataSet</description>
        </property>
    </parameters>
    <!-- Entry point: control goes straight to the resume_from decision node. -->
    <start to="resume_from"/>
    <!-- Routes on the resumeFrom property. Only the value 'CreateEBIDataSet' skips the download;
         every other value reaches DownloadEBILinks, either through the first case or the default. -->
    <decision name="resume_from">
        <switch>
            <case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
            <case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
            <default to="DownloadEBILinks"/>
        </switch>
    </decision>
    <!-- Failure node: the error transition of every action in this workflow points here.
         The message reports the error of the last node that failed. -->
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <!-- Submits the SparkDownloadEBILinks job with sourcePath and workingPath; on success the
         workflow continues with OverrideFolders.
         NOTE(review): spark2ExtraListeners, spark2SqlQueryExecutionListeners,
         spark2YarnHistoryServerAddress, spark2EventLogDir and nameNode are not declared in
         <parameters>; presumably they come from a config-default.xml shipped with this app. Confirm.
         NOTE(review): the class package is spelled 'dnetllib' (double l) while the JSON resource
         paths read by the Scala jobs use 'dnetlib'; confirm it matches the package declaration. -->
    <action name="DownloadEBILinks">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Incremental Download EBI Links</name>
            <class>eu.dnetllib.dhp.sx.bio.ebi.SparkDownloadEBILinks</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.shuffle.partitions=2000
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn</arg>
        </spark>
        <ok to="OverrideFolders"/>
        <error to="Kill"/>
    </action>
    <!-- Rotates the link dataset folders under sourcePath:
           1. drops the previous backup (ebi_links_dataset_old);
           2. renames the current ebi_links_dataset to ebi_links_dataset_old;
           3. moves workingPath/links_final into place as the new ebi_links_dataset.
         links_final is presumably written by DownloadEBILinks (not visible here; confirm). -->
    <action name="OverrideFolders">
        <fs>
            <delete path="${sourcePath}/ebi_links_dataset_old"/>
            <move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
            <move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
        </fs>
        <ok to="CreateEBIDataSet"/>
        <error to="Kill"/>
    </action>
    <!-- Submits the SparkEBILinksToOaf job, passing sourcePath/ebi_links_dataset as its source
         and targetPath as its target; on success the workflow ends.
         NOTE(review): sparkExtraOPT is not declared in <parameters>, and DownloadEBILinks spells
         out its listener/event-log settings instead of using it; confirm the property is always
         supplied by the job configuration. -->
    <action name="CreateEBIDataSet">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Create OAF DataSet</name>
            <class>eu.dnetllib.dhp.sx.bio.ebi.SparkEBILinksToOaf</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=2000
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
            <arg>--targetPath</arg><arg>${targetPath}</arg>
            <arg>--master</arg><arg>yarn</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/workflow.xml
@ -1,99 +0,0 @@
 <workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
    <!-- Workflow inputs; none of them declares a default value. -->
    <parameters>
        <property>
            <name>workingPath</name>
            <description>the Working Path</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
    </parameters>
    <!-- Entry point: execution begins at CreateEBIDataSet, not at the first action in the file. -->
    <start to="CreateEBIDataSet"/>
    <!-- Failure node targeted by the error transition of every action in this workflow. -->
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <!-- Submits the SparkCreateBaselineDataFrame job with the working path and the name node URI.
         NOTE(review): no transition visible in this workflow leads to this action (start goes to
         CreateEBIDataSet, which continues to GenerateUpdates); confirm whether it is still needed. -->
    <action name="GenerateBaselineDataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Create Baseline DataSet</name>
            <class>eu.dnetlib.dhp.sx.ebi.SparkCreateBaselineDataFrame</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --executor-cores=${sparkExecutorCores}
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn</arg>
            <arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <!-- First action executed by this workflow: submits the SparkEBILinksToOaf job with the
         working path, then hands over to GenerateUpdates. -->
    <action name="CreateEBIDataSet">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Create EBI DataSet</name>
            <class>eu.dnetlib.dhp.sx.ebi.SparkEBILinksToOaf</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.sql.shuffle.partitions=1000
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn</arg>
        </spark>
        <ok to="GenerateUpdates"/>
        <error to="Kill"/>
    </action>
    <!-- Last step of this workflow: submits the SparkAddLinkUpdates job with the working path. -->
    <action name="GenerateUpdates">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>Generate EBI Link Updates</name>
            <class>eu.dnetlib.dhp.sx.ebi.SparkAddLinkUpdates</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --executor-cores=${sparkExecutorCores}
                ${sparkExtraOPT}
            </spark-opts>
            <arg>--workingPath</arg><arg>${workingPath}</arg>
            <arg>--master</arg><arg>yarn</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/config-default.xml
@ -1,68 +0,0 @@
 <configuration>
    <!-- OCEAN: active endpoint settings (the hosts below are under ocean.icm.edu.pl).
         The GARR block that follows holds the same four properties, commented out. -->
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>spark2YarnHistoryServerAddress</name>
        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
    </property>
    <!-- GARR  -->
 <!--    <property>-->
 <!--        <name>jobTracker</name>-->
 <!--        <value>yarn</value>-->
 <!--    </property>-->
 <!--    <property>-->
 <!--        <name>nameNode</name>-->
 <!--        <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
 <!--    </property>-->
 <!--    <property>-->
 <!--        <name>hive_metastore_uris</name>-->
 <!--        <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
 <!--    </property>-->
 <!--    <property>-->
 <!--        <name>spark2YarnHistoryServerAddress</name>-->
 <!--        <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
 <!--    </property>-->
    <!-- Oozie launcher and sharelib settings, followed by the Spark 2 values that the
         DownloadEBILinks action of the update workflow substitutes into its spark-opts. -->
    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
    <!-- Appended to nameNode by the workflow to build spark.eventLog.dir. -->
    <property>
        <name>spark2EventLogDir</name>
        <value>/user/spark/spark2ApplicationHistory</value>
    </property>
    <!-- Both listener values include literal double quotes as part of the value. -->
    <property>
        <name>spark2ExtraListeners</name>
        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
    </property>
    <property>
        <name>spark2SqlQueryExecutionListeners</name>
        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/workflow.xml
@ -1,67 +0,0 @@
    <workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
        <!-- Workflow inputs; none of them declares a default value. -->
        <parameters>
            <property>
                <name>sourcePath</name>
                <description>the base path containing the ebi_links_dataset folder</description>
            </property>
            <property>
                <name>workingPath</name>
                <description>the Working Path</description>
            </property>
            <property>
                <name>sparkDriverMemory</name>
                <description>memory for driver process</description>
            </property>
            <property>
                <name>sparkExecutorMemory</name>
                <description>memory for individual executor</description>
            </property>
            <property>
                <name>sparkExecutorCores</name>
                <description>number of cores used by single executor</description>
            </property>
        </parameters>
        <!-- Entry point: the workflow always begins with the incremental download. -->
        <start to="DownloadEBILinks"/>
        <!-- Failure node targeted by the error transition of both actions in this workflow. -->
        <kill name="Kill">
            <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
        </kill>
        <!-- Submits the SparkDownloadEBILinks job with sourcePath and workingPath; on success the
             workflow continues with OverrideFolders. The spark2* properties and nameNode used in
             spark-opts are not declared in <parameters>; the config-default.xml of this app
             defines them. -->
        <action name="DownloadEBILinks">
            <spark xmlns="uri:oozie:spark-action:0.2">
                <master>yarn-cluster</master>
                <mode>cluster</mode>
                <name>Incremental Download EBI Links</name>
                <class>eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks</class>
                <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
                <spark-opts>
                    --executor-memory=${sparkExecutorMemory}
                    --executor-cores=${sparkExecutorCores}
                    --driver-memory=${sparkDriverMemory}
                    --conf spark.extraListeners=${spark2ExtraListeners}
                    --conf spark.sql.shuffle.partitions=2000
                    --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                    --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                    --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                </spark-opts>
                <arg>--sourcePath</arg><arg>${sourcePath}</arg>
                <arg>--workingPath</arg><arg>${workingPath}</arg>
                <arg>--master</arg><arg>yarn</arg>
            </spark>
            <ok to="OverrideFolders"/>
            <error to="Kill"/>
        </action>
        <!-- Rotates the link dataset folders under sourcePath: drops the previous backup
             (ebi_links_dataset_old), renames the current ebi_links_dataset to that backup name,
             then moves workingPath/links_final into place as the new ebi_links_dataset.
             links_final is presumably written by DownloadEBILinks (not visible here; confirm). -->
        <action name="OverrideFolders">
            <fs>
                <delete path="${sourcePath}/ebi_links_dataset_old"/>
                <move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
                <move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
            </fs>
            <ok to="End"/>
            <error to="Kill"/>
        </action>
        <end name="End"/>
    </workflow-app>