Implemented method to download delta updates in EBI Links

2021-08-30 09:32:21 +02:00 · 2021-08-30 09:32:21 +02:00 · e8b3cb9147
parent ccf4103a25
commit e8b3cb9147
6 changed files with 269 additions and 17 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala
@ -8,6 +8,7 @@ import org.apache.http.impl.client.{HttpClientBuilder, HttpClients}

 import java.io.IOException

+
 abstract class AbstractRestClient extends Iterator[String] {

  var buffer: List[String] = List()
@ -54,8 +55,6 @@ abstract class AbstractRestClient extends Iterator[String]{
  }


-
-
  private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
    val timeout = 60; // seconds
    val config = RequestConfig.custom()
@ -63,6 +62,7 @@ abstract class AbstractRestClient extends Iterator[String]{
      .setConnectionRequestTimeout(timeout * 1000)
      .setSocketTimeout(timeout * 1000).build()
    val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
+    try {
      var tries = 4
      while (tries > 0) {
        println(s"requesting ${r.getURI}")
@ -82,6 +82,11 @@ abstract class AbstractRestClient extends Iterator[String]{
        }
      }
      ""
+    } finally {
+      if (client != null)
+        client.close()
    }
+  }
+
  getBufferData()
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PubMedToOaf.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PubMedToOaf.scala
@ -1,10 +1,10 @@
 package eu.dnetlib.dhp.sx.graph.bio.pubmed
-import java.util.regex.Pattern
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
 import eu.dnetlib.dhp.schema.common.ModelConstants
-import eu.dnetlib.dhp.schema.oaf.utils.{CleaningFunctions, GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
 import eu.dnetlib.dhp.schema.oaf._
+import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}

+import java.util.regex.Pattern
 import scala.collection.JavaConverters._

 object PubMedToOaf {
@ -17,7 +17,7 @@ object PubMedToOaf {

  def cleanDoi(doi:String):String = {

-    val regex = "10.\\d{4,9}\\/[-._;()\\/:A-Z0-9]+$"
+    val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"


    val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkDownloadEBILinks.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkDownloadEBILinks.scala
@ -0,0 +1,115 @@
+package eu.dnetlib.dhp.sx.graph.ebi
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser
+import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
+import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
+import org.apache.commons.io.IOUtils
+import org.apache.http.client.config.RequestConfig
+import org.apache.http.client.methods.{HttpGet, HttpUriRequest}
+import org.apache.http.impl.client.HttpClientBuilder
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.expressions.Aggregator
+import org.apache.spark.sql.functions.max
+import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
+import org.slf4j.{Logger, LoggerFactory}
+
+/**
+ * Spark job that incrementally downloads the EBI data links of the PubMed
+ * articles whose links have not been requested yet, and merges them with the
+ * links already stored.
+ *
+ * Reads  `$sourcePath/ebi_links_dataset` and `$sourcePath/baseline_dataset`;
+ * writes `$workingPath/id_to_request`, `$workingPath/links_update` and
+ * `$workingPath/links_final`.
+ */
+object SparkDownloadEBILinks {
+
+  import scala.util.control.NonFatal
+
+
+  /**
+   * Downloads the links of a single PubMed article.
+   *
+   * @param pmid the PubMed identifier
+   * @return the link item, or `null` when nothing could be downloaded
+   *         (callers filter the `null` values out)
+   */
+  def createEBILinks(pmid:Long):EBILinkItem = {
+
+    val res = requestLinks(pmid)
+    // requestLinks reports a failed download with "", so an empty payload has
+    // to be discarded exactly like null, otherwise failures are stored as links
+    if (res != null && res.nonEmpty)
+      EBILinkItem(pmid, res)
+    else
+      null
+  }
+
+
+  /**
+   * Requests the data links of a PubMed article from the Europe PMC REST API,
+   * retrying up to 4 times on HTTP errors or exceptions.
+   *
+   * @param PMID the PubMed identifier
+   * @return the response body, or the empty string when every attempt failed
+   */
+  def requestLinks(PMID:Long):String = {
+    val r = new HttpGet(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
+    val timeout = 60 // seconds
+    val config = RequestConfig.custom()
+      .setConnectTimeout(timeout * 1000)
+      .setConnectionRequestTimeout(timeout * 1000)
+      .setSocketTimeout(timeout * 1000).build()
+    val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
+    try {
+      var tries = 4
+      var body: Option[String] = None
+      while (body.isEmpty && tries > 0) {
+        println(s"requesting ${r.getURI}")
+        try {
+          val response = client.execute(r)
+          try {
+            val status = response.getStatusLine.getStatusCode
+            println(s"get response with status$status")
+            // every 4xx/5xx status is a failure, 400 included
+            if (status >= 400)
+              tries -= 1
+            else
+              body = Some(IOUtils.toString(response.getEntity.getContent, "UTF-8"))
+          } finally {
+            // release the connection whatever the outcome of the attempt
+            response.close()
+          }
+        } catch {
+          // only recoverable errors are retried; fatal ones (OOM, ...) propagate
+          case NonFatal(e) =>
+            println(s"Error on requesting ${r.getURI}")
+            e.printStackTrace()
+            tries -= 1
+        }
+      }
+      body.getOrElse("")
+    } finally {
+      client.close()
+    }
+
+  }
+
+
+  /**
+   * Chooses which of two link items with the same id is kept: an item that is
+   * null or has null links loses, otherwise the one with the longer links
+   * payload wins (the second one on ties).
+   */
+  private def keepRicher(x: EBILinkItem, y: EBILinkItem): EBILinkItem = {
+    if (x == null || x.links == null)
+      y
+    else if (y == null || y.links == null)
+      x
+    else if (x.links.length > y.links.length)
+      x
+    else
+      y
+  }
+
+
+  /**
+   * Entry point. Expected arguments (see `ebi_download_update.json`):
+   * `master`, `sourcePath`, `workingPath`.
+   */
+  def main(args: Array[String]): Unit = {
+
+    val log: Logger = LoggerFactory.getLogger(getClass)
+    val MAX_ITEM_PER_PARTITION = 20000
+    val conf: SparkConf = new SparkConf()
+    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json")))
+    parser.parseArgument(args)
+    val spark: SparkSession =
+      SparkSession
+        .builder()
+        .config(conf)
+        .appName(getClass.getSimpleName)
+        .master(parser.get("master")).getOrCreate()
+
+    import spark.implicits._
+
+    implicit  val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
+    implicit  val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
+    implicit  val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
+
+    val sourcePath = parser.get("sourcePath")
+    log.info(s"sourcePath  -> $sourcePath")
+    val workingPath = parser.get("workingPath")
+    log.info(s"workingPath  -> $workingPath")
+
+    log.info("Getting max pubmedId where the links have been requested")
+    val links:Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
+    val maxRow = links.map(l => l.id).select(max("value")).first
+    // max() yields null on an empty dataset: in that case no link has been
+    // requested yet and every PMID is a candidate
+    val lastPMIDRequested = if (maxRow.isNullAt(0)) 0L else maxRow.getLong(0)
+
+    log.info("Retrieving PMID to request links")
+    val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
+    pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request")
+
+    val pmidToReq:Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
+
+    val total = pmidToReq.count()
+
+    // round up and never go below 1: repartition(0) is rejected by Spark, and
+    // that is what a plain integer division gives for fewer than 20000 ids
+    val numPartitions = math.max(1, ((total + MAX_ITEM_PER_PARTITION - 1) / MAX_ITEM_PER_PARTITION).toInt)
+
+    spark.createDataset(pmidToReq.rdd.repartition(numPartitions).map(pmid =>createEBILinks(pmid)).filter(l => l!= null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update")
+
+    val updates:Dataset[EBILinkItem] =spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
+
+    // one item per id: the stored links and the fresh ones are merged keeping
+    // the richer payload
+    links.union(updates).groupByKey(_.id)
+      .reduceGroups{(x,y) => keepRicher(x, y)}
+      .map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final")
+  }
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json
@ -0,0 +1,5 @@
+[
+  {"paramName":"mt",  "paramLongName":"master",     "paramDescription": "should be local or yarn",    "paramRequired": true},
+  {"paramName":"s",   "paramLongName":"sourcePath","paramDescription": "the source Path",             "paramRequired": true},
+  {"paramName":"w",   "paramLongName":"workingPath","paramDescription": "the working path",           "paramRequired": true}
+]
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/config-default.xml
@ -0,0 +1,68 @@
+<configuration>
+
+    <!-- OCEAN  -->
+
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>hive_metastore_uris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>spark2YarnHistoryServerAddress</name>
+        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
+    </property>
+
+
+    <!-- GARR  -->
+
+<!--    <property>-->
+<!--        <name>jobTracker</name>-->
+<!--        <value>yarn</value>-->
+<!--    </property>-->
+<!--    <property>-->
+<!--        <name>nameNode</name>-->
+<!--        <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
+<!--    </property>-->
+<!--    <property>-->
+<!--        <name>hive_metastore_uris</name>-->
+<!--        <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
+<!--    </property>-->
+<!--    <property>-->
+<!--        <name>spark2YarnHistoryServerAddress</name>-->
+<!--        <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
+<!--    </property>-->
+
+
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>spark2EventLogDir</name>
+        <value>/user/spark/spark2ApplicationHistory</value>
+    </property>
+    <property>
+        <name>spark2ExtraListeners</name>
+        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
+    </property>
+    <property>
+        <name>spark2SqlQueryExecutionListeners</name>
+        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/workflow.xml
@ -0,0 +1,59 @@
+<workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>sourcePath</name>
+            <description>the Source Path</description>
+        </property>
+        <property>
+            <name>workingPath</name>
+            <description>the Working Path</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+    </parameters>
+
+    <start to="DownloadEBILinks"/>
+
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+
+    <action name="DownloadEBILinks">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>Incremental Download EBI Links</name>
+            <class>eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.shuffle.partitions=2000
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+            <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--master</arg><arg>yarn</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>