diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkDownloadEBILinks.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkDownloadEBILinks.scala
index 08e060459..eda825bd0 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkDownloadEBILinks.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkDownloadEBILinks.scala
@@ -4,6 +4,8 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
import org.apache.commons.io.IOUtils
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.{HttpGet, HttpUriRequest}
import org.apache.http.impl.client.HttpClientBuilder
@@ -25,38 +27,78 @@ object SparkDownloadEBILinks {
}
- def requestLinks(PMID:Long):String = {
- val r = new HttpGet(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
- val timeout = 60; // seconds
- val config = RequestConfig.custom()
- .setConnectTimeout(timeout * 1000)
- .setConnectionRequestTimeout(timeout * 1000)
- .setSocketTimeout(timeout * 1000).build()
- val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
- try {
- var tries = 4
- while (tries > 0) {
- println(s"requesting ${r.getURI}")
- try {
- val response = client.execute(r)
- println(s"get response with status${response.getStatusLine.getStatusCode}")
- if (response.getStatusLine.getStatusCode > 400) {
- tries -= 1
- }
- else
- return IOUtils.toString(response.getEntity.getContent)
- } catch {
- case e: Throwable =>
- println(s"Error on requesting ${r.getURI}")
- e.printStackTrace()
- tries -= 1
+  /**
+   * HTTP GET the given URL and return the response body as a String.
+   *
+   * Makes up to 4 attempts; an attempt counts as failed when the status
+   * code is strictly greater than 400 or when `execute` throws. Any status
+   * <= 400 (note: including 400 itself, since the test is `> 400`) returns
+   * the body immediately. Returns "" after all attempts fail. The client
+   * is always closed in the finally block.
+   *
+   * NOTE(review): the catch clause handles every Throwable (including
+   * fatal VM errors) -- consider scala.util.control.NonFatal. Also the
+   * entity stream of a failed (>400) response is never consumed/closed,
+   * which can leak the pooled connection -- confirm and consider
+   * EntityUtils.consume on that path.
+   */
+  def requestPage(url:String):String = {
+    val r = new HttpGet(url)
+    val timeout = 60; // seconds
+    val config = RequestConfig.custom()
+      .setConnectTimeout(timeout * 1000)
+      .setConnectionRequestTimeout(timeout * 1000)
+      .setSocketTimeout(timeout * 1000).build()
+    val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
+    try {
+      // retry budget: 4 attempts total
+      var tries = 4
+      while (tries > 0) {
+        println(s"requesting ${r.getURI}")
+        try {
+          val response = client.execute(r)
+          println(s"get response with status${response.getStatusLine.getStatusCode}")
+          if (response.getStatusLine.getStatusCode > 400) {
+            tries -= 1
          }
+          else
+            return IOUtils.toString(response.getEntity.getContent)
+        } catch {
+          case e: Throwable =>
+            // network / client error: log and burn one retry
+            println(s"Error on requesting ${r.getURI}")
+            e.printStackTrace()
+            tries -= 1
          }
        }
-      ""
-    } finally {
-      if (client != null)
-        client.close()
      }
+      // all retries exhausted: callers receive the empty string
+      ""
+    } finally {
+      if (client != null)
+        client.close()
+    }
+  }
+
+
+  /**
+   * Scrape the NCBI PubMed update-files directory listing and return the
+   * absolute URLs of every gzipped update file found there.
+   *
+   * NOTE(review): this hunk was corrupted in transit (the "<a href="
+   * string literals were stripped, leaving unbalanced code); reconstructed
+   * here. Also fixed: String.substring takes an *end index* (exclusive),
+   * not a length, so the second argument must be `end`, not `end - start`
+   * -- the original form would truncate or throw for long listing lines.
+   *
+   * @return list of "https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/*.gz" URLs
+   */
+  def requestBaseLineUpdatePage():List[String] = {
+    val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
+
+    // The listing is plain HTML: one anchor per file. Keep only anchor
+    // lines, extract the href target, then keep only the .gz archives.
+    val result = data.lines.filter(l => l.startsWith("<a href=")).map { l =>
+      val end = l.lastIndexOf("\">")
+      val start = l.indexOf("<a href=\"")
+
+      if (start >= 0 && end > start)
+        l.substring(start + 9, end) // 9 == "<a href=\"".length; end is exclusive
+      else
+        ""
+    }.filter(s => s.endsWith(".gz")).map(s => s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s").toList
+
+    result
+  }
+
+  /**
+   * Connects to the HDFS namenode at `hdfsServerUri` and iterates over the
+   * (non-recursive) file listing of `baselinePath`.
+   *
+   * NOTE(review): despite its name this is an incomplete stub -- the loop
+   * body evaluates `c.getPath` and discards the result, so nothing is
+   * downloaded, written, or returned. Presumably it should diff the remote
+   * update files against this listing and fetch the missing ones; confirm
+   * the intended behavior before relying on it.
+   */
+  def downloadBaseLineUpdate(baselinePath:String, hdfsServerUri:String ):Unit = {
+
+
+    val conf = new Configuration
+    conf.set("fs.defaultFS", hdfsServerUri)
+    val fs = FileSystem.get(conf)
+    val p = new Path((baselinePath))
+    val files = fs.listFiles(p,false)
+
+    while (files.hasNext) {
+      val c = files.next()
+      c.getPath // NOTE(review): value discarded -- loop currently has no effect
+
+    }
+
+
+  }
+
+
+  /**
+   * Retrieve the EuropePMC data-links JSON for the given PubMed id.
+   * Delegates to requestPage, so it returns "" when all retries fail.
+   */
+  def requestLinks(PMID:Long):String = {
+    requestPage(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
   }
def main(args: Array[String]): Unit = {
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/BioScholixTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/BioScholixTest.scala
index 8e063db7c..b6058f71b 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/BioScholixTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/BioScholixTest.scala
@@ -7,6 +7,7 @@ import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
import eu.dnetlib.dhp.sx.graph.bio.pubmed.PubMedToOaf.dataInfo
+import eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
@@ -50,6 +51,13 @@ class BioScholixTest extends AbstractVocabularyTest{
}
+  // NOTE(review): live integration test -- issues a real HTTP request to
+  // the NCBI PubMed updatefiles listing, prints the result, and asserts
+  // nothing, so it passes whenever the call does not throw. Consider
+  // tagging/disabling it for offline CI runs.
+  @Test
+  def testDownloadEBIUpdate() = {
+    val data = SparkDownloadEBILinks.requestBaseLineUpdatePage()
+    println(data)
+  }
+
+
@Test
def testEBIData() = {
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("pubmed.xml")).mkString