forked from D-Net/dnet-hadoop
Implemented method to download delta updates in EBI Links
This commit is contained in:
parent
ccf4103a25
commit
e8b3cb9147
|
@ -8,6 +8,7 @@ import org.apache.http.impl.client.{HttpClientBuilder, HttpClients}
|
|||
|
||||
import java.io.IOException
|
||||
|
||||
|
||||
abstract class AbstractRestClient extends Iterator[String] {
|
||||
|
||||
var buffer: List[String] = List()
|
||||
|
@ -54,8 +55,6 @@ abstract class AbstractRestClient extends Iterator[String]{
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
|
@ -63,6 +62,7 @@ abstract class AbstractRestClient extends Iterator[String]{
|
|||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
try {
|
||||
var tries = 4
|
||||
while (tries > 0) {
|
||||
println(s"requesting ${r.getURI}")
|
||||
|
@ -82,6 +82,11 @@ abstract class AbstractRestClient extends Iterator[String]{
|
|||
}
|
||||
}
|
||||
""
|
||||
} finally {
|
||||
if (client != null)
|
||||
client.close()
|
||||
}
|
||||
}
|
||||
|
||||
getBufferData()
|
||||
}
|
|
@ -1,10 +1,10 @@
|
|||
package eu.dnetlib.dhp.sx.graph.bio.pubmed
|
||||
import java.util.regex.Pattern
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{CleaningFunctions, GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
|
||||
import eu.dnetlib.dhp.schema.oaf._
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
|
||||
|
||||
import java.util.regex.Pattern
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object PubMedToOaf {
|
||||
|
@ -17,7 +17,7 @@ object PubMedToOaf {
|
|||
|
||||
def cleanDoi(doi:String):String = {
|
||||
|
||||
val regex = "10.\\d{4,9}\\/[-._;()\\/:A-Z0-9]+$"
|
||||
val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
|
||||
|
||||
|
||||
val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
|
||||
|
|
|
@ -0,0 +1,115 @@
|
|||
package eu.dnetlib.dhp.sx.graph.ebi
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
|
||||
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.http.client.config.RequestConfig
|
||||
import org.apache.http.client.methods.{HttpGet, HttpUriRequest}
|
||||
import org.apache.http.impl.client.HttpClientBuilder
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql.functions.max
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
object SparkDownloadEBILinks {
|
||||
|
||||
|
||||
def createEBILinks(pmid:Long):EBILinkItem = {
|
||||
|
||||
val res = requestLinks(pmid)
|
||||
if (res!=null)
|
||||
return EBILinkItem(pmid, res)
|
||||
null
|
||||
}
|
||||
|
||||
|
||||
def requestLinks(PMID:Long):String = {
|
||||
val r = new HttpGet(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
try {
|
||||
var tries = 4
|
||||
while (tries > 0) {
|
||||
println(s"requesting ${r.getURI}")
|
||||
try {
|
||||
val response = client.execute(r)
|
||||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
if (response.getStatusLine.getStatusCode > 400) {
|
||||
tries -= 1
|
||||
}
|
||||
else
|
||||
return IOUtils.toString(response.getEntity.getContent)
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
println(s"Error on requesting ${r.getURI}")
|
||||
e.printStackTrace()
|
||||
tries -= 1
|
||||
}
|
||||
}
|
||||
""
|
||||
} finally {
|
||||
if (client != null)
|
||||
client.close()
|
||||
}
|
||||
|
||||
}
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val MAX_ITEM_PER_PARTITION = 20000
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json")))
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
|
||||
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
|
||||
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
val workingPath = parser.get("workingPath")
|
||||
log.info(s"workingPath -> $workingPath")
|
||||
|
||||
log.info("Getting max pubmedId where the links have been requested")
|
||||
val links:Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
|
||||
val lastPMIDRequested =links.map(l => l.id).select(max("value")).first.getLong(0)
|
||||
|
||||
log.info("Retrieving PMID to request links")
|
||||
val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
|
||||
pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request")
|
||||
|
||||
val pmidToReq:Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
|
||||
|
||||
val total = pmidToReq.count()
|
||||
|
||||
spark.createDataset(pmidToReq.rdd.repartition((total/MAX_ITEM_PER_PARTITION).toInt).map(pmid =>createEBILinks(pmid)).filter(l => l!= null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update")
|
||||
|
||||
val updates:Dataset[EBILinkItem] =spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
|
||||
|
||||
links.union(updates).groupByKey(_.id)
|
||||
.reduceGroups{(x,y) =>
|
||||
if (x == null || x.links ==null)
|
||||
y
|
||||
if (y ==null || y.links ==null)
|
||||
x
|
||||
if (x.links.length > y.links.length)
|
||||
x
|
||||
else
|
||||
y
|
||||
}.map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final")
|
||||
}
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "the source Path", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath","paramDescription": "the working path ", "paramRequired": true}
|
||||
]
|
|
@ -0,0 +1,68 @@
|
|||
<configuration>
|
||||
|
||||
<!-- OCEAN -->
|
||||
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||
</property>
|
||||
|
||||
|
||||
<!-- GARR -->
|
||||
|
||||
<!-- <property>-->
|
||||
<!-- <name>jobTracker</name>-->
|
||||
<!-- <value>yarn</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>nameNode</name>-->
|
||||
<!-- <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>hive_metastore_uris</name>-->
|
||||
<!-- <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2YarnHistoryServerAddress</name>-->
|
||||
<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
|
||||
<!-- </property>-->
|
||||
|
||||
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,59 @@
|
|||
<workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the Working Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the Working Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="DownloadEBILinks"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="DownloadEBILinks">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Incremental Download EBI Links</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
Loading…
Reference in New Issue