forked from D-Net/dnet-hadoop
Implemented new method for update baseline
This commit is contained in:
parent
f258bbb927
commit
b84e0cabeb
|
@ -4,6 +4,8 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
|
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
|
||||||
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
|
import org.apache.hadoop.conf.Configuration
|
||||||
|
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||||
import org.apache.http.client.config.RequestConfig
|
import org.apache.http.client.config.RequestConfig
|
||||||
import org.apache.http.client.methods.{HttpGet, HttpUriRequest}
|
import org.apache.http.client.methods.{HttpGet, HttpUriRequest}
|
||||||
import org.apache.http.impl.client.HttpClientBuilder
|
import org.apache.http.impl.client.HttpClientBuilder
|
||||||
|
@ -25,38 +27,78 @@ object SparkDownloadEBILinks {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def requestLinks(PMID:Long):String = {
|
def requestPage(url:String):String = {
|
||||||
val r = new HttpGet(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
|
val r = new HttpGet(url)
|
||||||
val timeout = 60; // seconds
|
val timeout = 60; // seconds
|
||||||
val config = RequestConfig.custom()
|
val config = RequestConfig.custom()
|
||||||
.setConnectTimeout(timeout * 1000)
|
.setConnectTimeout(timeout * 1000)
|
||||||
.setConnectionRequestTimeout(timeout * 1000)
|
.setConnectionRequestTimeout(timeout * 1000)
|
||||||
.setSocketTimeout(timeout * 1000).build()
|
.setSocketTimeout(timeout * 1000).build()
|
||||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||||
try {
|
try {
|
||||||
var tries = 4
|
var tries = 4
|
||||||
while (tries > 0) {
|
while (tries > 0) {
|
||||||
println(s"requesting ${r.getURI}")
|
println(s"requesting ${r.getURI}")
|
||||||
try {
|
try {
|
||||||
val response = client.execute(r)
|
val response = client.execute(r)
|
||||||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||||
if (response.getStatusLine.getStatusCode > 400) {
|
if (response.getStatusLine.getStatusCode > 400) {
|
||||||
tries -= 1
|
tries -= 1
|
||||||
}
|
|
||||||
else
|
|
||||||
return IOUtils.toString(response.getEntity.getContent)
|
|
||||||
} catch {
|
|
||||||
case e: Throwable =>
|
|
||||||
println(s"Error on requesting ${r.getURI}")
|
|
||||||
e.printStackTrace()
|
|
||||||
tries -= 1
|
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
return IOUtils.toString(response.getEntity.getContent)
|
||||||
|
} catch {
|
||||||
|
case e: Throwable =>
|
||||||
|
println(s"Error on requesting ${r.getURI}")
|
||||||
|
e.printStackTrace()
|
||||||
|
tries -= 1
|
||||||
}
|
}
|
||||||
""
|
|
||||||
} finally {
|
|
||||||
if (client != null)
|
|
||||||
client.close()
|
|
||||||
}
|
}
|
||||||
|
""
|
||||||
|
} finally {
|
||||||
|
if (client != null)
|
||||||
|
client.close()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def requestBaseLineUpdatePage():List[String] = {
|
||||||
|
val data =requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
|
||||||
|
|
||||||
|
val result =data.lines.filter(l => l.startsWith("<a href=")).map{l =>
|
||||||
|
val end = l.lastIndexOf("\">")
|
||||||
|
val start = l.indexOf("<a href=\"")
|
||||||
|
|
||||||
|
if (start>= 0 && end >start)
|
||||||
|
l.substring(start+9, (end-start))
|
||||||
|
else
|
||||||
|
""
|
||||||
|
}.filter(s =>s.endsWith(".gz") ).map(s => s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s").toList
|
||||||
|
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
def downloadBaseLineUpdate(baselinePath:String, hdfsServerUri:String ):Unit = {
|
||||||
|
|
||||||
|
|
||||||
|
val conf = new Configuration
|
||||||
|
conf.set("fs.defaultFS", hdfsServerUri)
|
||||||
|
val fs = FileSystem.get(conf)
|
||||||
|
val p = new Path((baselinePath))
|
||||||
|
val files = fs.listFiles(p,false)
|
||||||
|
|
||||||
|
while (files.hasNext) {
|
||||||
|
val c = files.next()
|
||||||
|
c.getPath
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def requestLinks(PMID:Long):String = {
|
||||||
|
requestPage(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
|
||||||
|
|
||||||
}
|
}
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
|
@ -7,6 +7,7 @@ import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
|
||||||
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved
|
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved
|
||||||
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
|
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
|
||||||
import eu.dnetlib.dhp.sx.graph.bio.pubmed.PubMedToOaf.dataInfo
|
import eu.dnetlib.dhp.sx.graph.bio.pubmed.PubMedToOaf.dataInfo
|
||||||
|
import eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
@ -50,6 +51,13 @@ class BioScholixTest extends AbstractVocabularyTest{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def testDownloadEBIUpdate() = {
|
||||||
|
val data = SparkDownloadEBILinks.requestBaseLineUpdatePage()
|
||||||
|
println(data)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testEBIData() = {
|
def testEBIData() = {
|
||||||
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("pubmed.xml")).mkString
|
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("pubmed.xml")).mkString
|
||||||
|
|
Loading…
Reference in New Issue