Implemented new method for update baseline

This commit is contained in:
Sandro La Bruzzo 2021-10-05 16:34:47 +02:00
parent f258bbb927
commit b84e0cabeb
2 changed files with 79 additions and 29 deletions

View File

@ -4,6 +4,8 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.{HttpGet, HttpUriRequest}
import org.apache.http.impl.client.HttpClientBuilder
@ -25,38 +27,78 @@ object SparkDownloadEBILinks {
}
def requestLinks(PMID:Long):String = {
val r = new HttpGet(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
val timeout = 60; // seconds
val config = RequestConfig.custom()
.setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000)
.setSocketTimeout(timeout * 1000).build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
try {
var tries = 4
while (tries > 0) {
println(s"requesting ${r.getURI}")
try {
val response = client.execute(r)
println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
}
else
return IOUtils.toString(response.getEntity.getContent)
} catch {
case e: Throwable =>
println(s"Error on requesting ${r.getURI}")
e.printStackTrace()
tries -= 1
def requestPage(url:String):String = {
val r = new HttpGet(url)
val timeout = 60; // seconds
val config = RequestConfig.custom()
.setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000)
.setSocketTimeout(timeout * 1000).build()
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
try {
var tries = 4
while (tries > 0) {
println(s"requesting ${r.getURI}")
try {
val response = client.execute(r)
println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) {
tries -= 1
}
else
return IOUtils.toString(response.getEntity.getContent)
} catch {
case e: Throwable =>
println(s"Error on requesting ${r.getURI}")
e.printStackTrace()
tries -= 1
}
""
} finally {
if (client != null)
client.close()
}
""
} finally {
if (client != null)
client.close()
}
}
def requestBaseLineUpdatePage():List[String] = {
val data =requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
val result =data.lines.filter(l => l.startsWith("<a href=")).map{l =>
val end = l.lastIndexOf("\">")
val start = l.indexOf("<a href=\"")
if (start>= 0 && end >start)
l.substring(start+9, (end-start))
else
""
}.filter(s =>s.endsWith(".gz") ).map(s => s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s").toList
result
}
def downloadBaseLineUpdate(baselinePath:String, hdfsServerUri:String ):Unit = {
val conf = new Configuration
conf.set("fs.defaultFS", hdfsServerUri)
val fs = FileSystem.get(conf)
val p = new Path((baselinePath))
val files = fs.listFiles(p,false)
while (files.hasNext) {
val c = files.next()
c.getPath
}
}
def requestLinks(PMID:Long):String = {
requestPage(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
}
def main(args: Array[String]): Unit = {

View File

@ -7,6 +7,7 @@ import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
import eu.dnetlib.dhp.sx.graph.bio.pubmed.PubMedToOaf.dataInfo
import eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
@ -50,6 +51,13 @@ class BioScholixTest extends AbstractVocabularyTest{
}
@Test
def testDownloadEBIUpdate() = {
val data = SparkDownloadEBILinks.requestBaseLineUpdatePage()
println(data)
}
@Test
def testEBIData() = {
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("pubmed.xml")).mkString