Implemented a new method for updating the baseline

Sandro La Bruzzo 2021-10-05 16:34:47 +02:00
parent f258bbb927
commit b84e0cabeb
2 changed files with 79 additions and 29 deletions

View File

@@ -4,6 +4,8 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
 import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
 import org.apache.commons.io.IOUtils
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.http.client.config.RequestConfig
 import org.apache.http.client.methods.{HttpGet, HttpUriRequest}
 import org.apache.http.impl.client.HttpClientBuilder
@@ -25,8 +27,8 @@ object SparkDownloadEBILinks {
   }

-  def requestLinks(PMID:Long):String = {
-    val r = new HttpGet(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
+  def requestPage(url:String):String = {
+    val r = new HttpGet(url)
     val timeout = 60; // seconds
     val config = RequestConfig.custom()
       .setConnectTimeout(timeout * 1000)
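Only the first lines of the refactor are visible in this hunk: requestLinks(PMID) is generalized into requestPage(url), which the new baseline helpers below reuse. A self-contained sketch of that pattern, assuming the same Apache HttpClient timeout configuration shown in the diff; the name fetchPage, the UTF-8 charset and the empty-string fallback on failure are illustrative assumptions, not the committed implementation:

  import org.apache.commons.io.IOUtils
  import org.apache.http.client.config.RequestConfig
  import org.apache.http.client.methods.HttpGet
  import org.apache.http.impl.client.HttpClientBuilder

  // Hedged sketch of a generic page fetcher in the spirit of requestPage(url).
  // Timeout values mirror the diff; error handling is an assumption.
  def fetchPage(url: String): String = {
    val request = new HttpGet(url)
    val timeout = 60 // seconds
    val config = RequestConfig.custom()
      .setConnectTimeout(timeout * 1000)
      .setConnectionRequestTimeout(timeout * 1000)
      .setSocketTimeout(timeout * 1000)
      .build()
    val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
    try {
      val response = client.execute(request)
      IOUtils.toString(response.getEntity.getContent, "UTF-8")
    } catch {
      case _: Throwable => "" // assumption: return an empty page on failure
    } finally {
      if (client != null)   // mirrors the null check shown in the diff
        client.close()
    }
  }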
@@ -57,6 +59,46 @@ object SparkDownloadEBILinks {
       if (client != null)
         client.close()
     }
+  }
+
+  def requestBaseLineUpdatePage():List[String] = {
+    val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
+
+    val result = data.lines.filter(l => l.startsWith("<a href=")).map { l =>
+      val end = l.lastIndexOf("\">")
+      val start = l.indexOf("<a href=\"")
+
+      if (start >= 0 && end > start)
+        l.substring(start + 9, (end - start))
+      else
+        ""
+    }.filter(s => s.endsWith(".gz")).map(s => s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s").toList
+
+    result
+  }
+
+  def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
+    val conf = new Configuration
+    conf.set("fs.defaultFS", hdfsServerUri)
+    val fs = FileSystem.get(conf)
+    val p = new Path(baselinePath)
+    val files = fs.listFiles(p, false)
+    while (files.hasNext) {
+      val c = files.next()
+      c.getPath
+    }
+  }
+
+  def requestLinks(PMID: Long): String = {
+    requestPage(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
+  }
   }

   def main(args: Array[String]): Unit = {
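requestBaseLineUpdatePage fetches the HTML directory listing at https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/ and pulls the *.gz links out with indexOf/substring. Worth noting: String.substring takes an end index as its second argument, so passing (end - start) there can truncate the extracted name whenever the anchor tag does not start at the beginning of the line. A minimal regex-based sketch of the same extraction, assuming the standard NCBI listing markup; extractUpdateLinks is a hypothetical helper, not a method in this commit:

  import scala.util.matching.Regex

  // Hedged sketch: pull the *.gz entries out of an NCBI-style directory listing
  // and turn them into absolute download URLs.
  def extractUpdateLinks(page: String): List[String] = {
    val href: Regex = """<a href="([^"]+\.gz)">""".r
    href.findAllMatchIn(page)
      .map(m => s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/${m.group(1)}")
      .toList
  }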
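downloadBaseLineUpdate is still a stub at this point: it connects to HDFS, lists what already sits under baselinePath, and discards each c.getPath without writing anything. A sketch of the follow-up step, under the assumption that the missing update files are meant to be streamed from the listing URLs into that folder; copyUpdateToHdfs, the exists check, and the streaming copy are illustrative, not the committed code:

  import java.net.URL
  import org.apache.commons.io.IOUtils
  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.{FileSystem, Path}

  // Hedged sketch: copy one remote *.gz update file into the baseline folder on HDFS,
  // skipping files that are already present. Hypothetical helper, not part of the commit.
  def copyUpdateToHdfs(fileUrl: String, baselinePath: String, hdfsServerUri: String): Unit = {
    val conf = new Configuration
    conf.set("fs.defaultFS", hdfsServerUri)
    val fs = FileSystem.get(conf)
    val fileName = fileUrl.substring(fileUrl.lastIndexOf('/') + 1)
    val target = new Path(s"$baselinePath/$fileName")
    if (!fs.exists(target)) {
      val in = new URL(fileUrl).openStream()
      val out = fs.create(target) // FSDataOutputStream on HDFS
      try IOUtils.copy(in, out)
      finally {
        in.close()
        out.close()
      }
    }
  }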

View File

@@ -7,6 +7,7 @@ import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
 import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved
 import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
 import eu.dnetlib.dhp.sx.graph.bio.pubmed.PubMedToOaf.dataInfo
+import eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST.{JField, JObject, JString}
 import org.json4s.jackson.JsonMethods.parse
@@ -50,6 +51,13 @@ class BioScholixTest extends AbstractVocabularyTest{
   }

+  @Test
+  def testDownloadEBIUpdate() = {
+    val data = SparkDownloadEBILinks.requestBaseLineUpdatePage()
+    println(data)
+  }
+
   @Test
   def testEBIData() = {
     val inputXML = Source.fromInputStream(getClass.getResourceAsStream("pubmed.xml")).mkString