diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/BioDBToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/BioDBToOAF.scala
similarity index 83%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/BioDBToOAF.scala
rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/BioDBToOAF.scala
index 90b65c8f70..dffc88c6ca 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/BioDBToOAF.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/BioDBToOAF.scala
@@ -1,14 +1,12 @@
-package eu.dnetlib.dhp.sx.graph.bio
+package eu.dnetllib.dhp.sx.bio
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
-import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, Instance, KeyValue, Oaf, Relation, StructuredProperty}
+import eu.dnetlib.dhp.schema.oaf._
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.{compact, parse, render}
-
-import scala.collection.JavaConverters._
-
+import collection.JavaConverters._
object BioDBToOAF {
case class EBILinkItem(id: Long, links: String) {}
@@ -17,23 +15,23 @@ object BioDBToOAF {
case class UniprotDate(date: String, date_info: String) {}
- case class ScholixResolved(pid:String, pidType:String, typology:String, tilte:List[String], datasource:List[String], date:List[String], authors:List[String]){}
+ case class ScholixResolved(pid: String, pidType: String, typology: String, tilte: List[String], datasource: List[String], date: List[String], authors: List[String]) {}
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val SUBJ_CLASS = "Keywords"
val DATE_RELATION_KEY = "RelationDate"
- val resolvedURL:Map[String,String] = Map(
- "genbank"-> "https://www.ncbi.nlm.nih.gov/nuccore/",
- "ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
- "ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
- "ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
- "ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
- "clinicaltrials.gov"-> "https://clinicaltrials.gov/ct2/show/",
- "onim"-> "https://omim.org/entry/",
- "refseq"-> "https://www.ncbi.nlm.nih.gov/nuccore/",
- "geo"-> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
+ val resolvedURL: Map[String, String] = Map(
+ "genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
+ "ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
+ "ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
+ "ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
+ "ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
+ "clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
+ "onim" -> "https://omim.org/entry/",
+ "refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
+ "geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
)
@@ -45,7 +43,7 @@ object BioDBToOAF {
val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature")
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)")
- val pubmedCollectedFrom:KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
+ val pubmedCollectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
PDBCollectedFrom.setDataInfo(DATA_INFO)
@@ -58,9 +56,9 @@ object BioDBToOAF {
Map(
"uniprot" -> UNIPROTCollectedFrom,
- "pdb"-> PDBCollectedFrom,
- "elsevier" ->ElsevierCollectedFrom,
- "ebi" ->EBICollectedFrom,
+ "pdb" -> PDBCollectedFrom,
+ "elsevier" -> ElsevierCollectedFrom,
+ "ebi" -> EBICollectedFrom,
"Springer Nature" -> springerNatureCollectedFrom,
"NCBI Nucleotide" -> ncbiCollectedFrom,
"European Nucleotide Archive" -> enaCollectedFrom,
@@ -68,7 +66,7 @@ object BioDBToOAF {
)
}
- def crossrefLinksToOaf(input:String):Oaf = {
+ def crossrefLinksToOaf(input: String): Oaf = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val source_pid = (json \ "Source" \ "Identifier" \ "ID").extract[String].toLowerCase
@@ -77,16 +75,16 @@ object BioDBToOAF {
val target_pid = (json \ "Target" \ "Identifier" \ "ID").extract[String].toLowerCase
val target_pid_type = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String].toLowerCase
- val relation_semantic= (json \ "RelationshipType" \ "Name").extract[String]
+ val relation_semantic = (json \ "RelationshipType" \ "Name").extract[String]
val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
- createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type),collectedFromMap("elsevier"),"relationship", relation_semantic, date)
+ createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type), collectedFromMap("elsevier"), "relationship", relation_semantic, date)
}
- def scholixResolvedToOAF(input:ScholixResolved):Oaf = {
+ def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
val d = new Dataset
@@ -127,18 +125,18 @@ object BioDBToOAF {
d.setInstance(List(i).asJava)
if (input.authors != null && input.authors.nonEmpty) {
- val authors = input.authors.map(a =>{
+ val authors = input.authors.map(a => {
val authorOAF = new Author
authorOAF.setFullname(a)
authorOAF
})
d.setAuthor(authors.asJava)
}
- if (input.date!= null && input.date.nonEmpty) {
- val dt = input.date.head
- i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
- d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
- }
+ if (input.date != null && input.date.nonEmpty) {
+ val dt = input.date.head
+ i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
+ d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
+ }
d
}
@@ -190,7 +188,7 @@ object BioDBToOAF {
OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
).asJava)
}
- var i_date:Option[UniprotDate] = None
+ var i_date: Option[UniprotDate] = None
if (dates.nonEmpty) {
i_date = dates.find(d => d.date_info.contains("entry version"))
@@ -218,12 +216,12 @@ object BioDBToOAF {
if (references_pmid != null && references_pmid.nonEmpty) {
- val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
+ val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
rel.getCollectedfrom
List(d, rel)
}
else if (references_doi != null && references_doi.nonEmpty) {
- val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
+ val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
List(d, rel)
}
else
@@ -231,13 +229,12 @@ object BioDBToOAF {
}
-
- def generate_unresolved_id(pid:String, pidType:String) :String = {
+ def generate_unresolved_id(pid: String, pidType: String): String = {
s"unresolved::$pid::$pidType"
}
- def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType:String, relClass:String, date:String):Relation = {
+ def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType: String, relClass: String, date: String): Relation = {
val rel = new Relation
rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
@@ -251,7 +248,7 @@ object BioDBToOAF {
rel.setTarget(s"unresolved::$pid::$pidType")
- val dateProps:KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
+ val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
@@ -262,8 +259,8 @@ object BioDBToOAF {
}
- def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date:String): Relation = {
- createRelation(pid,pidType,sourceId,collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date)
+ def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date: String): Relation = {
+ createRelation(pid, pidType, sourceId, collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date)
}
@@ -338,7 +335,7 @@ object BioDBToOAF {
def EBITargetLinksFilter(input: EBILinks): Boolean = {
- input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase("pdb") || input.targetPidType.equalsIgnoreCase("uniprot")
+ input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase("pdb") || input.targetPidType.equalsIgnoreCase("uniprot")
}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/SparkTransformBioDatabaseToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala
similarity index 86%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/SparkTransformBioDatabaseToOAF.scala
rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala
index d66cc84eca..16d2b25a62 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/SparkTransformBioDatabaseToOAF.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala
@@ -1,8 +1,8 @@
-package eu.dnetlib.dhp.sx.graph.bio
+package eu.dnetllib.dhp.sx.bio
import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
-import BioDBToOAF.ScholixResolved
+import eu.dnetlib.dhp.schema.oaf.Oaf
+import eu.dnetllib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
@@ -31,17 +31,16 @@ object SparkTransformBioDatabaseToOAF {
.master(parser.get("master")).getOrCreate()
val sc = spark.sparkContext
- implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
- import spark.implicits._
-
+ implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
+ import spark.implicits._
database.toUpperCase() match {
case "UNIPROT" =>
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).write.mode(SaveMode.Overwrite).save(targetPath)
- case "PDB"=>
+ case "PDB" =>
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).write.mode(SaveMode.Overwrite).save(targetPath)
case "SCHOLIX" =>
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).write.mode(SaveMode.Overwrite).save(targetPath)
- case "CROSSREF_LINKS"=>
+ case "CROSSREF_LINKS" =>
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).write.mode(SaveMode.Overwrite).save(targetPath)
}
}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
new file mode 100644
index 0000000000..97b3cdc99a
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala
@@ -0,0 +1,202 @@
+package eu.dnetllib.dhp.sx.bio.ebi
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
+import eu.dnetlib.dhp.schema.oaf.Result
+import eu.dnetlib.dhp.utils.ISLookupClientFactory
+import eu.dnetllib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
+import org.apache.commons.io.IOUtils
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
+import org.apache.http.client.config.RequestConfig
+import org.apache.http.client.methods.HttpGet
+import org.apache.http.impl.client.HttpClientBuilder
+import org.apache.spark.SparkConf
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.expressions.Aggregator
+import org.apache.spark.sql._
+import org.slf4j.{Logger, LoggerFactory}
+
+import java.io.InputStream
+import scala.io.Source
+import scala.xml.pull.XMLEventReader
+
+object SparkCreateBaselineDataFrame {
+
+
+ /**
+  * Lists the PubMed update files that are newer than the last one already stored.
+  *
+  * The NCBI directory listing page is fetched through `requestPage`; every line that starts with an
+  * anchor tag is parsed to extract the linked file name.
+  *
+  * @param maxFile name of the most recent file already available; only names that compare
+  *                lexicographically greater than it are returned
+  * @return pairs of (file name, absolute download URL) for each `.gz` file still to be fetched
+  */
+ def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
+   val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
+
+   val result = data.lines.filter(l => l.startsWith("<a href=")).map { l =>
+     val end = l.lastIndexOf("\">")
+     val start = l.indexOf("<a href=\"")
+
+     // 9 is the length of the `<a href="` prefix; the file name ends right before `">`
+     if (start >= 0 && end > start)
+       l.substring(start + 9, end)
+     else
+       ""
+   }.filter(s => s.endsWith(".gz")).filter(s => s > maxFile).map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList
+
+   result
+ }
+
+
+ /**
+  * Opens an HTTP GET on the given URL and hands back the raw response body stream.
+  *
+  * Connect, connection-request and socket timeouts are all set to 60 seconds. The HTTP status is
+  * only printed, it is not checked. Neither the client nor the response is closed here, so the
+  * caller is responsible for closing the returned stream.
+  *
+  * @param url address of the file to download
+  * @return the content stream of the response entity
+  */
+ def downloadBaselinePart(url: String): InputStream = {
+   val request = new HttpGet(url)
+   val timeoutMillis = 60 * 1000
+   val requestConfig = RequestConfig.custom()
+     .setConnectTimeout(timeoutMillis)
+     .setConnectionRequestTimeout(timeoutMillis)
+     .setSocketTimeout(timeoutMillis)
+     .build()
+   val httpClient = HttpClientBuilder.create().setDefaultRequestConfig(requestConfig).build()
+   val httpResponse = httpClient.execute(request)
+   val statusCode = httpResponse.getStatusLine.getStatusCode
+   println(s"get response with status$statusCode")
+   httpResponse.getEntity.getContent
+ }
+
+ /**
+  * Downloads the page at `url` as a string, retrying on failure.
+  *
+  * Up to 4 attempts are made. An attempt fails when the server answers with an HTTP error status
+  * (4xx/5xx) or when a non-fatal exception is raised while executing the request or reading the body.
+  * Each response is closed after use and the client is closed before returning.
+  *
+  * @param url address of the page to request
+  * @return the response body, or an empty string when every attempt failed
+  */
+ def requestPage(url: String): String = {
+   val r = new HttpGet(url)
+   val timeout = 60 // seconds
+   val config = RequestConfig.custom()
+     .setConnectTimeout(timeout * 1000)
+     .setConnectionRequestTimeout(timeout * 1000)
+     .setSocketTimeout(timeout * 1000).build()
+   val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
+   try {
+     var tries = 4
+     var page: Option[String] = None
+     while (page.isEmpty && tries > 0) {
+       println(s"requesting ${r.getURI}")
+       try {
+         val response = client.execute(r)
+         try {
+           val status = response.getStatusLine.getStatusCode
+           println(s"get response with status$status")
+           // 400 itself is a client error too, so the whole 4xx/5xx range counts as a failed attempt
+           if (status >= 400)
+             tries -= 1
+           else
+             page = Some(IOUtils.toString(response.getEntity.getContent))
+         } finally {
+           // release the connection even when the attempt is discarded
+           response.close()
+         }
+       } catch {
+         // fatal JVM errors (OutOfMemoryError, InterruptedException, ...) are left to propagate
+         case scala.util.control.NonFatal(e) =>
+           println(s"Error on requesting ${r.getURI}")
+           e.printStackTrace()
+           tries -= 1
+       }
+     }
+     page.getOrElse("")
+   } finally {
+     client.close()
+   }
+ }
+
+
+ /**
+  * Downloads into `baselinePath` every PubMed update file that is newer than the newest file
+  * already stored there.
+  *
+  * The newest stored file is the one whose name is lexicographically greatest among the files
+  * listed (non recursively) under `baselinePath`. Existing files with the same name are overwritten.
+  *
+  * @param baselinePath  HDFS directory holding the already downloaded files
+  * @param hdfsServerUri value used for `fs.defaultFS` when opening the file system
+  */
+ def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
+
+   val conf = new Configuration
+   conf.set("fs.defaultFS", hdfsServerUri)
+   val fs = FileSystem.get(conf)
+   val p = new Path(baselinePath)
+   val files = fs.listFiles(p, false)
+   var max_file = ""
+   while (files.hasNext) {
+     val c = files.next()
+     val data = c.getPath.toString
+     val fileName = data.substring(data.lastIndexOf("/") + 1)
+
+     if (fileName > max_file)
+       max_file = fileName
+   }
+
+   val files_to_download = requestBaseLineUpdatePage(max_file)
+
+   files_to_download.foreach { case (fileName, fileUrl) =>
+     val hdfsWritePath: Path = new Path(s"$baselinePath/$fileName")
+     val fsDataOutputStream: FSDataOutputStream = fs.create(hdfsWritePath, true)
+     try {
+       val i = downloadBaselinePart(fileUrl)
+       try {
+         val buffer = new Array[Byte](1024)
+         var bytesRead = i.read(buffer)
+         while (bytesRead != -1) {
+           // write only the bytes actually read: a short read must not flush stale buffer content
+           fsDataOutputStream.write(buffer, 0, bytesRead)
+           bytesRead = i.read(buffer)
+         }
+       } finally {
+         i.close()
+       }
+     } finally {
+       fsDataOutputStream.close()
+     }
+     println(s"Downloaded $fileUrl into $baselinePath/$fileName")
+   }
+
+ }
+
+
+ /**
+  * Aggregator that collapses the articles of a group into a single one.
+  *
+  * The buffer starts as an empty `PMArticle`; as soon as the buffer holds an article with a
+  * non-null pmid it is kept, otherwise the incoming article replaces it.
+  */
+ val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] = new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
+
+   override def bufferEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
+
+   override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
+
+   override def zero: PMArticle = new PMArticle
+
+   // keep the buffered article unless it is missing or has no pmid yet
+   override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
+     if (b == null || b.getPmid == null) a._2 else b
+   }
+
+   // same rule when two partial buffers are combined: the first usable one wins
+   override def merge(b1: PMArticle, b2: PMArticle): PMArticle = {
+     if (b1 == null || b1.getPmid == null) b2 else b1
+   }
+
+   override def finish(reduction: PMArticle): PMArticle = reduction
+ }
+
+
+ /**
+  * Spark job entry point: refreshes the PubMed baseline and converts it to OAF results.
+  *
+  * Steps, in order: download the missing update files, parse the `.gz` XML files into `PMArticle`s,
+  * keep one article per pmid and store them under `$workingPath/baseline_dataset`, then map every
+  * article with `PubMedToOaf.convert` and write the non-null results to `targetPath`.
+  *
+  * @param args command line arguments, parsed according to `baseline_to_oaf_params.json`
+  */
+ def main(args: Array[String]): Unit = {
+   val conf: SparkConf = new SparkConf()
+   val log: Logger = LoggerFactory.getLogger(getClass)
+   val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json")))
+   parser.parseArgument(args)
+   val isLookupUrl: String = parser.get("isLookupUrl")
+   log.info("isLookupUrl: {}", isLookupUrl)
+   val workingPath = parser.get("workingPath")
+   log.info("workingPath: {}", workingPath)
+
+   val targetPath = parser.get("targetPath")
+   log.info("targetPath: {}", targetPath)
+
+   val hdfsServerUri = parser.get("hdfsServerUri")
+   log.info("hdfsServerUri: {}", hdfsServerUri)
+
+
+   val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
+   val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
+   val spark: SparkSession =
+     SparkSession
+       .builder()
+       .config(conf)
+       .appName(getClass.getSimpleName)
+       .master(parser.get("master")).getOrCreate()
+
+   val sc = spark.sparkContext
+   import spark.implicits._
+
+   implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
+   implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
+   implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
+   implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
+
+   downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
+
+   // NOTE(review): updates are downloaded into "$workingPath/baseline" but the files are read from
+   // "$workingPath/baseline_ftp" - confirm the two directories are meant to differ
+   val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline_ftp", 2000)
+   val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
+     val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
+     new PMParser(xml)
+
+   }))
+
+   // one article per pmid, chosen by pmArticleAggregator
+   ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
+     .agg(pmArticleAggregator.toColumn)
+     .map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
+
+   val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
+   exported_dataset
+     .map(a => PubMedToOaf.convert(a, vocabularies)).as[Result]
+     .filter(p => p != null)
+     .write.mode(SaveMode.Overwrite).save(targetPath)
+ }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala
new file mode 100644
index 0000000000..578db1ea94
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala
@@ -0,0 +1,117 @@
+package eu.dnetllib.dhp.sx.bio.ebi
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser
+import eu.dnetllib.dhp.sx.bio.BioDBToOAF.EBILinkItem
+import eu.dnetllib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
+import org.apache.commons.io.IOUtils
+import org.apache.http.client.config.RequestConfig
+import org.apache.http.client.methods.HttpGet
+import org.apache.http.impl.client.HttpClientBuilder
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.functions.max
+import org.apache.spark.sql._
+import org.slf4j.{Logger, LoggerFactory}
+
+object SparkDownloadEBILinks {
+
+ /**
+  * Requests the EBI links of a PubMed record and wraps them together with its identifier.
+  *
+  * @param pmid PubMed identifier to look up
+  * @return an `EBILinkItem` holding the response of `requestLinks`, or null when that response is null
+  */
+ def createEBILinks(pmid: Long): EBILinkItem =
+   Option(requestLinks(pmid)).map(linksJson => EBILinkItem(pmid, linksJson)).orNull
+
+ /**
+  * Downloads the page at `url` as a string, retrying on failure.
+  *
+  * Up to 4 attempts are made. An attempt fails when the server answers with an HTTP error status
+  * (4xx/5xx) or when a non-fatal exception is raised while executing the request or reading the body.
+  * Each response is closed after use and the client is closed before returning.
+  *
+  * @param url address of the page to request
+  * @return the response body, or an empty string when every attempt failed
+  */
+ def requestPage(url: String): String = {
+   val r = new HttpGet(url)
+   val timeout = 60 // seconds
+   val config = RequestConfig.custom()
+     .setConnectTimeout(timeout * 1000)
+     .setConnectionRequestTimeout(timeout * 1000)
+     .setSocketTimeout(timeout * 1000).build()
+   val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
+   try {
+     var tries = 4
+     var page: Option[String] = None
+     while (page.isEmpty && tries > 0) {
+       println(s"requesting ${r.getURI}")
+       try {
+         val response = client.execute(r)
+         try {
+           val status = response.getStatusLine.getStatusCode
+           println(s"get response with status$status")
+           // 400 itself is a client error too, so the whole 4xx/5xx range counts as a failed attempt
+           if (status >= 400)
+             tries -= 1
+           else
+             page = Some(IOUtils.toString(response.getEntity.getContent))
+         } finally {
+           // release the connection even when the attempt is discarded
+           response.close()
+         }
+       } catch {
+         // fatal JVM errors (OutOfMemoryError, InterruptedException, ...) are left to propagate
+         case scala.util.control.NonFatal(e) =>
+           println(s"Error on requesting ${r.getURI}")
+           e.printStackTrace()
+           tries -= 1
+       }
+     }
+     page.getOrElse("")
+   } finally {
+     client.close()
+   }
+ }
+
+ /**
+  * Fetches, through `requestPage`, the Europe PMC datalinks of a PubMed record in JSON format.
+  *
+  * @param PMID PubMed identifier placed in the request URL
+  * @return whatever `requestPage` returns for the datalinks URL
+  */
+ def requestLinks(PMID: Long): String = {
+   val datalinksUrl = s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json"
+   requestPage(datalinksUrl)
+ }
+
+ /**
+  * Spark job entry point: requests the EBI links of the PubMed records not yet processed.
+  *
+  * Steps, in order: read the greatest pmid already present in `$sourcePath/ebi_links_dataset`,
+  * store the greater pmids of `$sourcePath/baseline_dataset` under `$workingPath/id_to_request`,
+  * request their links into `$workingPath/links_update`, then merge old and new links keeping one
+  * item per id and write the outcome to `$workingPath/links_final`.
+  *
+  * @param args command line arguments, parsed according to `ebi_download_update.json`
+  */
+ def main(args: Array[String]): Unit = {
+
+   val log: Logger = LoggerFactory.getLogger(getClass)
+   val MAX_ITEM_PER_PARTITION = 20000
+   val conf: SparkConf = new SparkConf()
+   // NOTE(review): the resource is still looked up under ".../sx/graph/ebi/" while the sibling job
+   // reads from ".../sx/bio/ebi/" - confirm the resource location after the package move
+   val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json")))
+   parser.parseArgument(args)
+   val spark: SparkSession =
+     SparkSession
+       .builder()
+       .config(conf)
+       .appName(getClass.getSimpleName)
+       .master(parser.get("master")).getOrCreate()
+
+   import spark.implicits._
+
+   implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
+   implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
+   implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
+
+   val sourcePath = parser.get("sourcePath")
+   log.info(s"sourcePath -> $sourcePath")
+   val workingPath = parser.get("workingPath")
+   log.info(s"workingPath -> $workingPath")
+
+   log.info("Getting max pubmedId where the links have been requested")
+   val links: Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
+   val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)
+
+   log.info("Retrieving PMID to request links")
+   val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
+   pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request")
+
+   val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
+
+   val total = pmidToReq.count()
+   // repartition rejects a non-positive partition count, which is what the division yields when
+   // fewer than MAX_ITEM_PER_PARTITION ids are pending
+   val numPartitions = math.max(1, (total / MAX_ITEM_PER_PARTITION).toInt)
+
+   spark.createDataset(pmidToReq.rdd.repartition(numPartitions).map(pmid => createEBILinks(pmid)).filter(l => l != null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update")
+
+   val updates: Dataset[EBILinkItem] = spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
+
+   // one item per id: an item without links loses, otherwise the longer links string wins
+   links.union(updates).groupByKey(_.id)
+     .reduceGroups { (x, y) =>
+       if (x == null || x.links == null)
+         y
+       else if (y == null || y.links == null)
+         x
+       else if (x.links.length > y.links.length)
+         x
+       else
+         y
+     }.map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final")
+ }
+}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkEBILinksToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala
similarity index 58%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkEBILinksToOaf.scala
rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala
index f14e5f264a..0db469769c 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkEBILinksToOaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala
@@ -1,15 +1,14 @@
-package eu.dnetlib.dhp.sx.graph.ebi
+package eu.dnetllib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Oaf
-import eu.dnetlib.dhp.sx.graph.bio
-import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
-import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
+import eu.dnetllib.dhp.sx.bio.BioDBToOAF
+import eu.dnetllib.dhp.sx.bio.BioDBToOAF.EBILinkItem
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
+import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
+
object SparkEBILinksToOaf {
def main(args: Array[String]): Unit = {
@@ -24,22 +23,17 @@ object SparkEBILinksToOaf {
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
+
+ import spark.implicits._
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
+ implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
- import spark.implicits._
- implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
+ val ebLinks: Dataset[EBILinkItem] = spark.read.load(s"${sourcePath}_dataset").as[EBILinkItem].filter(l => l.links != null)
- val ebi_rdd:Dataset[EBILinkItem] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => BioDBToOAF.extractEBILinksFromDump(s))).as[EBILinkItem]
-
- ebi_rdd.write.mode(SaveMode.Overwrite).save(s"${sourcePath}_dataset")
-
- val ebLinks:Dataset[EBILinkItem] = spark.read.load(s"${sourcePath}_dataset").as[EBILinkItem].filter(l => l.links!= null)
-
- ebLinks.flatMap(j =>BioDBToOAF.parse_ebi_links(j.links))
- .repartition(4000)
+ ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p))
.write.mode(SaveMode.Overwrite).save(targetPath)
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMArticle.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMArticle.java
similarity index 97%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMArticle.java
rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMArticle.java
index 211cbcffb4..305bb89be0 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMArticle.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMArticle.java
@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed;
+package eu.dnetllib.dhp.sx.bio.pubmed;
import java.io.Serializable;
import java.util.ArrayList;
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMAuthor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMAuthor.java
similarity index 92%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMAuthor.java
rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMAuthor.java
index ba69998c5d..c89929981b 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMAuthor.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMAuthor.java
@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed;
+package eu.dnetllib.dhp.sx.bio.pubmed;
import java.io.Serializable;
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMGrant.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMGrant.java
similarity index 93%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMGrant.java
rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMGrant.java
index 0c3fd46010..7df5dd5f2f 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMGrant.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMGrant.java
@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed;
+package eu.dnetllib.dhp.sx.bio.pubmed;
public class PMGrant {
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMJournal.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMJournal.java
similarity index 94%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMJournal.java
rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMJournal.java
index d251354d47..6065416f8d 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMJournal.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMJournal.java
@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed;
+package eu.dnetllib.dhp.sx.bio.pubmed;
import java.io.Serializable;
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMParser.scala
similarity index 99%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMParser.scala
rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMParser.scala
index 8744bdfb4c..8fa226b7d5 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMParser.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMParser.scala
@@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed
+package eu.dnetllib.dhp.sx.bio.pubmed
import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMSubject.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMSubject.java
similarity index 94%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMSubject.java
rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMSubject.java
index 354b2cbe5d..e6ab61b875 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PMSubject.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PMSubject.java
@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed;
+package eu.dnetllib.dhp.sx.bio.pubmed;
public class PMSubject {
private String value;
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PubMedToOaf.scala
similarity index 93%
rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PubMedToOaf.scala
rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PubMedToOaf.scala
index 202eb7b14d..a1777a230c 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/PubMedToOaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetllib/dhp/sx/bio/pubmed/PubMedToOaf.scala
@@ -1,11 +1,12 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed
+package eu.dnetllib.dhp.sx.bio.pubmed
+
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants
-import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
+import eu.dnetlib.dhp.schema.oaf._
+import scala.collection.JavaConverters._
import java.util.regex.Pattern
-import scala.collection.JavaConverters._
object PubMedToOaf {
@@ -15,7 +16,7 @@ object PubMedToOaf {
"doi" -> "https://dx.doi.org/"
)
- def cleanDoi(doi:String):String = {
+ def cleanDoi(doi: String): String = {
val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
@@ -71,14 +72,14 @@ object PubMedToOaf {
if (article.getPublicationTypes == null)
return null
val i = new Instance
- var pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
+ val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
if (pidList == null)
return null
- var alternateIdentifier :StructuredProperty = null
+ var alternateIdentifier: StructuredProperty = null
if (article.getDoi != null) {
val normalizedPid = cleanDoi(article.getDoi)
- if (normalizedPid!= null)
+ if (normalizedPid != null)
alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
}
@@ -102,10 +103,10 @@ object PubMedToOaf {
return result
result.setDataInfo(dataInfo)
i.setPid(pidList.asJava)
- if (alternateIdentifier!= null)
+ if (alternateIdentifier != null)
i.setAlternateIdentifier(List(alternateIdentifier).asJava)
result.setInstance(List(i).asJava)
- i.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection breakOut)
+ i.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut)
val urlLists: List[String] = pidList
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
.filter(t => t._1.nonEmpty)
@@ -136,7 +137,7 @@ object PubMedToOaf {
}
- val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection breakOut)
+ val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection.breakOut)
if (subjects != null)
result.setSubject(subjects.asJava)
@@ -148,7 +149,7 @@ object PubMedToOaf {
author.setFullname(a.getFullName)
author.setRank(index + 1)
author
- }(collection breakOut)
+ }(collection.breakOut)
if (authors != null && authors.nonEmpty)
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json
new file mode 100644
index 0000000000..4bee770bd5
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json
@@ -0,0 +1,7 @@
+[
+ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
+ {"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true},
+ {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequential file to read", "paramRequired": true},
+ {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the oaf path ", "paramRequired": true},
+ {"paramName":"h", "paramLongName":"hdfsServerUri", "paramDescription": "the URI of the HDFS server", "paramRequired": true}
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json
new file mode 100644
index 0000000000..0860ed558b
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json
@@ -0,0 +1,5 @@
+[
+ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
+ {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
+ {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path ", "paramRequired": true}
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/ebi_to_df_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json
similarity index 100%
rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/ebi_to_df_params.json
rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/pubmed/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/config-default.xml
similarity index 100%
rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/pubmed/oozie_app/config-default.xml
rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/config-default.xml
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/pubmed/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
similarity index 78%
rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/pubmed/oozie_app/workflow.xml
rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
index 914d1c2c78..f5a98ba5ef 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/pubmed/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
@@ -1,13 +1,9 @@
-
+
baselineWorkingPath
the Baseline Working Path
-
- targetPath
- the Target Path
-
isLookupUrl
The IS lookUp service endopoint
@@ -24,8 +20,8 @@
yarn
cluster
- Convert Baseline to Dataset
- eu.dnetlib.dhp.sx.graph.ebi.SparkCreateBaselineDataFrame
+ Convert Baseline to OAF Dataset
+ eu.dnetllib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame
dhp-graph-mapper-${projectVersion}.jar
--executor-memory=${sparkExecutorMemory}
@@ -38,9 +34,10 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--workingPath${baselineWorkingPath}
- --targetPath${targetPath}
+ --targetPath${baselineWorkingPath}/transformed
--masteryarn
--isLookupUrl${isLookupUrl}
+ --hdfsServerUri${nameNode}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetllib/dhp/sx/bio/BioScholixTest.scala
similarity index 93%
rename from dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/BioScholixTest.scala
rename to dhp-workflows/dhp-aggregation/src/test/java/eu/dnetllib/dhp/sx/bio/BioScholixTest.scala
index 8e063db7c0..c072f149ce 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/bio/pubmed/BioScholixTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetllib/dhp/sx/bio/BioScholixTest.scala
@@ -1,12 +1,10 @@
-package eu.dnetlib.dhp.sx.graph.bio.pubmed
+package eu.dnetllib.dhp.sx.bio
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
-import eu.dnetlib.dhp.schema.common.ModelConstants
-import eu.dnetlib.dhp.schema.oaf.utils.{CleaningFunctions, OafMapperUtils, PidType}
+import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
-import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved
-import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
-import eu.dnetlib.dhp.sx.graph.bio.pubmed.PubMedToOaf.dataInfo
+import eu.dnetllib.dhp.sx.bio.BioDBToOAF.ScholixResolved
+import eu.dnetllib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PubMedToOaf}
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
@@ -50,9 +48,11 @@ class BioScholixTest extends AbstractVocabularyTest{
}
+
+
@Test
def testEBIData() = {
- val inputXML = Source.fromInputStream(getClass.getResourceAsStream("pubmed.xml")).mkString
+ val inputXML = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")).mkString
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
new PMParser(xml).foreach(s =>println(mapper.writeValueAsString(s)))
}
@@ -62,7 +62,7 @@ class BioScholixTest extends AbstractVocabularyTest{
def testPubmedToOaf(): Unit = {
assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
- val records:String =Source.fromInputStream(getClass.getResourceAsStream("pubmed_dump")).mkString
+ val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")).mkString
val r:List[Oaf] = records.lines.toList.map(s=>mapper.readValue(s, classOf[PMArticle])).map(a => PubMedToOaf.convert(a, vocabularies))
assertEquals(10, r.size)
assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p)))
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/crossref_links b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/crossref_links
similarity index 100%
rename from dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/crossref_links
rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/crossref_links
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz
similarity index 100%
rename from dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz
rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/ls_result b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/ls_result
new file mode 100644
index 0000000000..98a0841c45
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/ls_result
@@ -0,0 +1,1433 @@
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0001.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0002.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0003.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0004.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0005.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0006.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0007.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0008.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0009.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0010.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0011.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0012.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0013.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0014.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0015.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0016.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0017.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0018.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0019.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0020.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0021.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0022.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0023.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0024.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0025.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0026.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0027.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0028.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0029.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0030.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0031.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0032.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0033.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0034.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0035.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0036.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0037.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0038.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0039.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0040.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0041.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0042.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0043.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0044.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0045.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0046.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0047.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0048.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0049.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0050.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0051.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0052.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0053.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0054.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0055.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0056.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0057.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0058.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0059.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0060.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0061.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0062.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0063.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0064.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0065.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0066.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0067.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0068.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0069.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0070.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0071.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0072.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0073.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0074.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0075.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0076.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0077.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0078.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0079.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0080.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0081.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0082.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0083.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0084.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0085.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0086.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0087.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0088.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0089.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0090.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0091.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0092.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0093.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0094.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0095.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0096.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0097.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0098.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0099.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0100.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0101.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0102.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0103.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0104.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0105.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0106.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0107.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0108.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0109.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0110.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0111.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0112.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0113.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0114.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0115.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0116.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0117.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0118.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0119.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0120.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0121.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0122.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0123.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0124.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0125.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0126.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0127.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0128.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0129.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0130.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0131.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0132.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0133.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0134.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0135.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0136.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0137.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0138.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0139.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0140.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0141.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0142.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0143.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0144.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0145.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0146.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0147.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0148.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0149.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0150.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0151.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0152.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0153.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0154.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0155.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0156.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0157.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0158.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0159.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0160.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0161.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0162.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0163.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0164.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0165.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0166.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0167.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0168.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0169.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0170.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0171.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0172.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0173.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0174.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0175.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0176.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0177.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0178.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0179.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0180.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0181.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0182.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0183.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0184.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0185.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0186.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0187.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0188.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0189.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0190.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0191.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0192.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0193.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0194.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0195.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0196.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0197.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0198.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0199.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0200.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0201.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0202.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0203.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0204.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0205.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0206.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0207.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0208.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0209.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0210.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0211.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0212.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0213.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0214.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0215.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0216.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0217.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0218.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0219.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0220.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0221.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0222.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0223.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0224.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0225.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0226.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0227.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0228.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0229.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0230.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0231.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0232.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0233.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0234.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0235.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0236.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0237.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0238.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0239.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0240.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0241.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0242.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0243.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0244.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0245.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0246.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0247.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0248.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0249.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0250.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0251.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0252.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0253.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0254.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0255.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0256.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0257.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0258.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0259.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0260.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0261.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0262.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0263.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0264.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0265.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0266.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0267.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0268.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0269.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0270.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0271.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0272.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0273.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0274.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0275.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0276.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0277.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0278.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0279.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0280.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0281.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0282.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0283.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0284.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0285.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0286.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0287.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0288.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0289.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0290.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0291.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0292.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0293.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0294.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0295.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0296.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0297.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0298.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0299.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0300.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0301.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0302.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0303.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0304.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0305.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0306.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0307.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0308.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0309.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0310.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0311.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0312.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0313.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0314.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0315.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0316.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0317.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0318.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0319.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0320.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0321.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0322.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0323.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0324.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0325.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0326.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0327.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0328.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0329.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0330.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0331.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0332.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0333.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0334.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0335.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0336.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0337.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0338.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0339.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0340.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0341.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0342.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0343.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0344.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0345.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0346.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0347.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0348.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0349.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0350.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0351.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0352.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0353.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0354.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0355.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0356.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0357.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0358.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0359.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0360.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0361.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0362.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0363.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0364.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0365.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0366.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0367.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0368.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0369.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0370.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0371.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0372.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0373.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0374.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0375.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0376.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0377.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0378.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0379.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0380.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0381.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0382.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0383.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0384.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0385.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0386.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0387.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0388.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0389.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0390.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0391.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0392.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0393.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0394.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0395.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0396.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0397.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0398.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0399.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0400.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0401.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0402.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0403.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0404.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0405.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0406.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0407.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0408.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0409.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0410.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0411.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0412.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0413.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0414.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0415.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0416.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0417.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0418.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0419.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0420.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0421.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0422.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0423.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0424.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0425.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0426.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0427.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0428.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0429.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0430.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0431.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0432.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0433.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0434.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0435.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0436.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0437.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0438.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0439.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0440.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0441.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0442.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0443.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0444.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0445.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0446.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0447.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0448.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0449.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0450.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0451.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0452.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0453.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0454.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0455.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0456.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0457.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0458.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0459.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0460.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0461.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0462.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0463.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0464.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0465.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0466.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0467.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0468.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0469.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0470.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0471.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0472.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0473.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0474.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0475.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0476.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0477.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0478.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0479.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0480.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0481.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0482.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0483.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0484.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0485.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0486.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0487.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0488.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0489.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0490.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0491.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0492.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0493.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0494.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0495.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0496.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0497.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0498.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0499.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0500.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0501.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0502.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0503.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0504.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0505.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0506.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0507.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0508.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0509.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0510.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0511.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0512.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0513.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0514.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0515.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0516.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0517.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0518.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0519.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0520.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0521.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0522.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0523.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0524.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0525.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0526.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0527.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0528.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0529.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0530.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0531.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0532.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0533.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0534.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0535.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0536.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0537.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0538.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0539.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0540.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0541.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0542.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0543.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0544.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0545.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0546.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0547.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0548.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0549.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0550.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0551.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0552.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0553.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0554.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0555.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0556.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0557.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0558.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0559.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0560.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0561.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0562.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0563.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0564.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0565.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0566.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0567.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0568.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0569.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0570.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0571.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0572.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0573.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0574.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0575.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0576.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0577.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0578.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0579.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0580.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0581.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0582.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0583.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0584.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0585.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0586.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0587.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0588.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0589.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0590.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0591.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0592.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0593.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0594.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0595.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0596.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0597.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0598.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0599.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0600.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0601.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0602.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0603.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0604.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0605.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0606.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0607.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0608.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0609.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0610.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0611.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0612.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0613.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0614.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0615.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0616.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0617.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0618.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0619.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0620.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0621.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0622.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0623.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0624.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0625.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0626.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0627.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0628.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0629.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0630.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0631.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0632.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0633.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0634.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0635.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0636.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0637.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0638.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0639.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0640.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0641.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0642.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0643.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0644.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0645.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0646.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0647.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0648.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0649.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0650.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0651.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0652.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0653.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0654.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0655.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0656.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0657.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0658.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0659.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0660.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0661.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0662.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0663.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0664.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0665.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0666.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0667.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0668.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0669.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0670.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0671.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0672.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0673.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0674.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0675.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0676.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0677.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0678.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0679.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0680.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0681.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0682.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0683.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0684.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0685.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0686.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0687.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0688.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0689.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0690.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0691.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0692.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0693.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0694.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0695.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0696.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0697.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0698.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0699.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0700.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0701.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0702.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0703.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0704.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0705.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0706.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0707.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0708.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0709.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0710.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0711.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0712.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0713.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0714.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0715.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0716.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0717.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0718.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0719.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0720.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0721.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0722.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0723.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0724.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0725.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0726.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0727.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0728.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0729.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0730.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0731.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0732.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0733.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0734.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0735.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0736.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0737.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0738.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0739.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0740.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0741.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0742.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0743.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0744.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0745.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0746.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0747.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0748.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0749.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0750.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0751.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0752.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0753.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0754.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0755.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0756.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0757.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0758.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0759.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0760.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0761.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0762.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0763.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0764.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0765.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0766.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0767.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0768.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0769.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0770.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0771.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0772.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0773.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0774.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0775.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0776.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0777.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0778.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0779.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0780.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0781.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0782.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0783.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0784.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0785.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0786.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0787.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0788.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0789.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0790.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0791.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0792.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0793.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0794.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0795.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0796.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0797.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0798.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0799.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0800.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0801.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0802.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0803.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0804.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0805.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0806.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0807.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0808.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0809.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0810.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0811.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0812.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0813.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0814.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0815.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0816.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0817.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0818.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0819.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0820.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0821.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0822.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0823.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0824.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0825.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0826.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0827.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0828.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0829.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0830.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0831.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0832.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0833.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0834.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0835.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0836.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0837.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0838.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0839.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0840.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0841.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0842.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0843.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0844.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0845.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0846.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0847.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0848.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0849.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0850.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0851.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0852.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0853.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0854.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0855.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0856.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0857.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0858.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0859.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0860.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0861.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0862.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0863.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0864.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0865.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0866.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0867.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0868.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0869.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0870.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0871.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0872.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0873.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0874.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0875.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0876.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0877.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0878.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0879.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0880.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0881.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0882.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0883.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0884.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0885.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0886.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0887.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0888.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0889.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0890.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0891.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0892.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0893.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0894.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0895.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0896.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0897.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0898.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0899.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0900.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0901.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0902.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0903.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0904.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0905.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0906.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0907.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0908.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0909.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0910.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0911.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0912.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0913.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0914.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0915.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0916.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0917.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0918.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0919.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0920.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0921.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0922.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0923.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0924.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0925.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0926.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0927.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0928.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0929.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0930.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0931.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0932.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0933.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0934.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0935.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0936.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0937.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0938.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0939.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0940.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0941.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0942.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0943.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0944.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0945.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0946.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0947.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0948.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0949.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0950.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0951.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0952.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0953.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0954.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0955.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0956.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0957.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0958.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0959.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0960.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0961.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0962.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0963.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0964.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0965.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0966.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0967.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0968.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0969.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0970.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0971.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0972.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0973.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0974.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0975.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0976.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0977.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0978.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0979.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0980.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0981.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0982.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0983.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0984.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0985.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0986.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0987.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0988.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0989.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0990.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0991.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0992.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0993.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0994.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0995.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0996.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0997.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0998.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n0999.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1000.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1001.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1002.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1003.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1004.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1005.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1006.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1007.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1008.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1009.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1010.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1011.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1012.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1013.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1014.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1015.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1016.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1017.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1018.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1019.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1020.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1021.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1022.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1023.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1024.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1025.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1026.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1027.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1028.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1029.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1030.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1031.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1032.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1033.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1034.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1035.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1036.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1037.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1038.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1039.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1040.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1041.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1042.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1043.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1044.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1045.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1046.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1047.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1048.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1049.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1050.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1051.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1052.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1053.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1054.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1055.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1056.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1057.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1058.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1059.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1060.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1061.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1062.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1063.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1064.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1065.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1066.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1067.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1068.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1069.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1070.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1071.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1072.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1073.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1074.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1075.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1076.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1077.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1078.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1079.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1080.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1081.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1082.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1083.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1084.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1085.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1086.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1087.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1088.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1089.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1090.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1091.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1092.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1093.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1094.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1095.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1096.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1097.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1098.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1099.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1100.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1101.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1102.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1103.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1104.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1105.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1106.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1107.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1108.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1109.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1110.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1111.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1112.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1113.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1114.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1115.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1116.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1117.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1118.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1119.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1120.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1121.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1122.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1123.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1124.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1125.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1126.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1127.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1128.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1129.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1130.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1131.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1132.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1133.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1134.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1135.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1136.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1137.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1138.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1139.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1140.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1141.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1142.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1143.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1144.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1145.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1146.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1147.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1148.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1149.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1150.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1151.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1152.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1153.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1154.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1155.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1156.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1157.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1158.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1159.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1160.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1161.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1162.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1163.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1164.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1165.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1166.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1167.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1168.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1169.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1170.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1171.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1172.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1173.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1174.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1175.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1176.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1177.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1178.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1179.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1180.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1181.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1182.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1183.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1184.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1185.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1186.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1187.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1188.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1189.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1190.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1191.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1192.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1193.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1194.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1195.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1196.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1197.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1198.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1199.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1200.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1201.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1202.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1203.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1204.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1205.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1206.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1207.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1208.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1209.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1210.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1211.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1212.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1213.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1214.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1215.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1216.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1217.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1218.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1219.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1220.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1221.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1222.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1223.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1224.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1225.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1226.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1227.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1228.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1229.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1230.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1231.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1232.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1233.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1234.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1235.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1236.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1237.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1238.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1239.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1240.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1241.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1242.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1243.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1244.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1245.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1246.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1247.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1248.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1249.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1250.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1251.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1252.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1253.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1254.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1255.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1256.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1257.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1258.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1259.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1260.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1261.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1262.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1263.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1264.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1265.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1266.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1267.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1268.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1269.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1270.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1271.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1272.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1273.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1274.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1275.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1276.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1277.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1278.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1279.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1280.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1281.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1282.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1283.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1284.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1285.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1286.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1287.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1288.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1289.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1290.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1291.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1292.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1293.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1294.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1295.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1296.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1297.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1298.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1299.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1300.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1301.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1302.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1303.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1304.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1305.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1306.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1307.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1308.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1309.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1310.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1311.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1312.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1313.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1314.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1315.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1316.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1317.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1318.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1319.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1320.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1321.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1322.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1323.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1324.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1325.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1326.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1327.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1328.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1329.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1330.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1331.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1332.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1333.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1334.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1335.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1336.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1337.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1338.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1339.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1340.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1341.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1342.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1343.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1344.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1345.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1346.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1347.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1348.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1349.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1350.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1351.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1352.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1353.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1354.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1355.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1356.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1357.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1358.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1359.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1360.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1361.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1362.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1363.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1364.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1365.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1366.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1367.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1368.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1369.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1370.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1371.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1372.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1373.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1374.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1375.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1376.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1377.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1378.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1379.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1380.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1381.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1382.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1383.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1384.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1385.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1386.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1387.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1388.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1389.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1390.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1391.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1392.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1393.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1394.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1395.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1396.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1397.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1398.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1399.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1400.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1401.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1402.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1403.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1404.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1405.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1406.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1407.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1408.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1409.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1410.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1411.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1412.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1413.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1414.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1415.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1416.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1417.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1418.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1419.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1420.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1421.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1422.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1423.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1424.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1425.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1426.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1427.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1428.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1429.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1430.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1431.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1432.xml.gz
+hdfs://nameservice1/data/scholix/input/baseline/pubmed21n1433.xml.gz
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pdb_dump b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pdb_dump
similarity index 100%
rename from dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pdb_dump
rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pdb_dump
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed/pubmed.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml
similarity index 100%
rename from dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed/pubmed.xml
rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed/pubmed_dump b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump
similarity index 100%
rename from dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed/pubmed_dump
rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved
similarity index 100%
rename from dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved
rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump
similarity index 100%
rename from dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump
rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java
index 48ba8a6f6d..7d91e47cc9 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java
@@ -6,10 +6,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
@@ -22,7 +19,6 @@ import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.oa.dedup.model.Block;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
@@ -32,7 +28,6 @@ import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
-import scala.Tuple3;
public class SparkWhitelistSimRels extends AbstractSparkAction {
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/UpdateOpenorgsJob.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/UpdateOpenorgsJob.java
new file mode 100644
index 0000000000..d094fb72b7
--- /dev/null
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/UpdateOpenorgsJob.java
@@ -0,0 +1,117 @@
+
+package eu.dnetlib.dhp.oa.dedup;
+
+import java.util.concurrent.TimeUnit;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
+public class UpdateOpenorgsJob {
+	// Calls the OpenOrgs update API once, then polls "<apiUrl>/status" for as long as the import is RUNNING.
+	private static final Logger log = LoggerFactory.getLogger(UpdateOpenorgsJob.class);
+	/** Entry point; reads --apiUrl and --delay (minutes) and throws RuntimeException unless the update ends in SUCCESS. */
+	public static void main(String[] args) throws Exception {
+		ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					UpdateOpenorgsJob.class
+						.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/updateOpenorgsJob_parameters.json")));
+		parser.parseArgument(args);
+
+		final String apiUrl = parser.get("apiUrl");
+		final int delay = Integer.parseInt(parser.get("delay")); // minutes slept between two status requests
+
+		log.info("apiUrl: '{}'", apiUrl);
+		log.info("delay: '{}'", delay);
+		// Constant-first equals: a response without a status must reach the failure checks below, not throw an NPE.
+		APIResponse res = httpCall(apiUrl);
+		while (res != null && ImportStatus.RUNNING.equals(res.getStatus())) {
+			TimeUnit.MINUTES.sleep(delay);
+			res = httpCall(apiUrl + "/status");
+		}
+
+		if (res == null) {
+			log.error("Openorgs Update FAILED: No response");
+			throw new RuntimeException("Openorgs Update FAILED: No response");
+		}
+
+		if (!ImportStatus.SUCCESS.equals(res.getStatus())) { // also true when the status is null
+			log.error("Openorgs Update FAILED: '{}' - '{}'", res.getStatus(), res.getMessage());
+			throw new RuntimeException(res.getMessage());
+		}
+
+	}
+	/** Performs an HTTP GET on the given url and deserializes the JSON response body into an APIResponse. */
+	private static APIResponse httpCall(final String url) throws Exception {
+		final HttpGet req = new HttpGet(url);
+
+		try (final CloseableHttpClient client = HttpClients.createDefault()) {
+			try (final CloseableHttpResponse response = client.execute(req)) {
+				final String s = IOUtils.toString(response.getEntity().getContent());
+				return (new ObjectMapper()).readValue(s, APIResponse.class);
+			}
+		}
+	}
+
+}
+
+class APIResponse { // bean that UpdateOpenorgsJob.httpCall fills from the JSON body via Jackson's ObjectMapper
+ private String id;
+ private Long dateStart;
+ private Long dateEnd;
+ private ImportStatus status; // checked by UpdateOpenorgsJob.main to decide whether to keep polling or to fail
+ private String message; // logged and used as the exception message when the update does not end in SUCCESS
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ public Long getDateStart() {
+ return dateStart;
+ }
+
+ public void setDateStart(Long dateStart) {
+ this.dateStart = dateStart;
+ }
+
+ public Long getDateEnd() {
+ return dateEnd;
+ }
+
+ public void setDateEnd(Long dateEnd) {
+ this.dateEnd = dateEnd;
+ }
+
+ public ImportStatus getStatus() {
+ return status;
+ }
+
+ public void setStatus(ImportStatus status) {
+ this.status = status;
+ }
+
+ public String getMessage() {
+ return message;
+ }
+
+ public void setMessage(String message) {
+ this.message = message;
+ }
+}
+
+enum ImportStatus { // values of APIResponse.status; UpdateOpenorgsJob polls while RUNNING and requires SUCCESS
+ SUCCESS, FAILED, RUNNING, NOT_LAUNCHED, NOT_YET_STARTED
+}
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/workflow.xml
index 30442406cc..6947019e8b 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/workflow.xml
@@ -28,6 +28,11 @@
dbPwd
password to access the OpenOrgs database
+
+ dbConnections
+ 10
+ number of connections to the postgres db
+
workingPath
path for the working directory
@@ -223,7 +228,7 @@
--dbTable${dbTable}
--dbUser${dbUser}
--dbPwd${dbPwd}
- --numConnections20
+ --numConnections${dbConnections}
@@ -254,19 +259,24 @@
--dbTable${dbTable}
--dbUser${dbUser}
--dbPwd${dbPwd}
- --numConnections20
+ --numConnections${dbConnections}
-
- ${jobTracker}
- ${nameNode}
- /usr/bin/curl
- ${apiUrl}
-
+
+
+
+ oozie.launcher.mapreduce.user.classpath.first
+ true
+
+
+ eu.dnetlib.dhp.oa.dedup.UpdateOpenorgsJob
+ --apiUrl${apiUrl}
+ --delay5
+
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/updateOpenorgsJob_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/updateOpenorgsJob_parameters.json
new file mode 100644
index 0000000000..5ca4a3dba6
--- /dev/null
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/updateOpenorgsJob_parameters.json
@@ -0,0 +1,14 @@
+[
+ {
+ "paramName": "api",
+ "paramLongName": "apiUrl",
+ "paramDescription": "the url of the API",
+ "paramRequired": true
+ },
+ {
+ "paramName": "d",
+ "paramLongName": "delay",
+ "paramDescription": "delay, in minutes, between two consecutive status requests to the API",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala
index e501b48233..501073e74e 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala
@@ -208,7 +208,7 @@ object SparkGenerateDoiBoost {
(r.getTarget,r)
else
("resolved", r)
- })
+ })(Encoders.tuple(Encoders.STRING, mapEncoderRel))
val openaireOrganization:Dataset[(String,String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x,y) => if (x != null) x else y ).map(_._2)
@@ -222,7 +222,7 @@ object SparkGenerateDoiBoost {
else
currentRels.setTarget(currentOrgs._1)
currentRels
- }.write.save(s"$workingDirPath/doiBoostPublicationAffiliation")
+ }.filter(r=> !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).map( item => {
val affiliation = item._2
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkCreateBaselineDataFrame.scala
deleted file mode 100644
index 26efd723f1..0000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkCreateBaselineDataFrame.scala
+++ /dev/null
@@ -1,93 +0,0 @@
-package eu.dnetlib.dhp.sx.graph.ebi
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
-import eu.dnetlib.dhp.schema.oaf.Result
-import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
-import eu.dnetlib.dhp.utils.ISLookupClientFactory
-import org.apache.commons.io.IOUtils
-import org.apache.spark.SparkConf
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.expressions.Aggregator
-import org.apache.spark.sql._
-import org.slf4j.{Logger, LoggerFactory}
-
-import scala.io.Source
-import scala.xml.pull.XMLEventReader
-
-object SparkCreateBaselineDataFrame {
-
-
- val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] = new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
- override def zero: PMArticle = new PMArticle
-
- override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
- if (b != null && b.getPmid!= null) b else a._2
- }
-
- override def merge(b1: PMArticle, b2: PMArticle): PMArticle = {
- if (b1 != null && b1.getPmid!= null) b1 else b2
-
- }
-
- override def finish(reduction: PMArticle): PMArticle = reduction
-
- override def bufferEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
-
- override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
- }
-
-
- def main(args: Array[String]): Unit = {
- val conf: SparkConf = new SparkConf()
- val log: Logger = LoggerFactory.getLogger(getClass)
- val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/baseline_to_oaf_params.json")))
- parser.parseArgument(args)
- val isLookupUrl: String = parser.get("isLookupUrl")
- log.info("isLookupUrl: {}", isLookupUrl)
- val workingPath = parser.get("workingPath")
- log.info("workingPath: {}", workingPath)
-
- val targetPath = parser.get("targetPath")
- log.info("targetPath: {}", targetPath)
-
- val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
- val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
- val spark: SparkSession =
- SparkSession
- .builder()
- .config(conf)
- .appName(SparkEBILinksToOaf.getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
- import spark.implicits._
-
-
- val sc = spark.sparkContext
-
-
-
- implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
- implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
- implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
- implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
-
- val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline",2000)
- val ds:Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i =>{
- val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
- new PMParser(xml)
-
- } ))
-
- ds.map(p => (p.getPmid,p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
- .agg(pmArticleAggregator.toColumn)
- .map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
-
- val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
- exported_dataset
- .map(a => PubMedToOaf.convert(a, vocabularies)).as[Result]
- .filter(p => p!= null)
- .write.mode(SaveMode.Overwrite).save(targetPath)
-
- //s"$workingPath/oaf/baseline_oaf"
- }
-}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkDownloadEBILinks.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkDownloadEBILinks.scala
deleted file mode 100644
index 08e0604594..0000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ebi/SparkDownloadEBILinks.scala
+++ /dev/null
@@ -1,115 +0,0 @@
-package eu.dnetlib.dhp.sx.graph.ebi
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
-import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
-import org.apache.commons.io.IOUtils
-import org.apache.http.client.config.RequestConfig
-import org.apache.http.client.methods.{HttpGet, HttpUriRequest}
-import org.apache.http.impl.client.HttpClientBuilder
-import org.apache.spark.SparkConf
-import org.apache.spark.sql.expressions.Aggregator
-import org.apache.spark.sql.functions.max
-import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
-import org.slf4j.{Logger, LoggerFactory}
-
-object SparkDownloadEBILinks {
-
-
- def createEBILinks(pmid:Long):EBILinkItem = {
-
- val res = requestLinks(pmid)
- if (res!=null)
- return EBILinkItem(pmid, res)
- null
- }
-
-
- def requestLinks(PMID:Long):String = {
- val r = new HttpGet(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
- val timeout = 60; // seconds
- val config = RequestConfig.custom()
- .setConnectTimeout(timeout * 1000)
- .setConnectionRequestTimeout(timeout * 1000)
- .setSocketTimeout(timeout * 1000).build()
- val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
- try {
- var tries = 4
- while (tries > 0) {
- println(s"requesting ${r.getURI}")
- try {
- val response = client.execute(r)
- println(s"get response with status${response.getStatusLine.getStatusCode}")
- if (response.getStatusLine.getStatusCode > 400) {
- tries -= 1
- }
- else
- return IOUtils.toString(response.getEntity.getContent)
- } catch {
- case e: Throwable =>
- println(s"Error on requesting ${r.getURI}")
- e.printStackTrace()
- tries -= 1
- }
- }
- ""
- } finally {
- if (client != null)
- client.close()
- }
-
- }
- def main(args: Array[String]): Unit = {
-
- val log: Logger = LoggerFactory.getLogger(getClass)
- val MAX_ITEM_PER_PARTITION = 20000
- val conf: SparkConf = new SparkConf()
- val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json")))
- parser.parseArgument(args)
- val spark: SparkSession =
- SparkSession
- .builder()
- .config(conf)
- .appName(SparkEBILinksToOaf.getClass.getSimpleName)
- .master(parser.get("master")).getOrCreate()
-
- import spark.implicits._
-
- implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
- implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
- implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
-
- val sourcePath = parser.get("sourcePath")
- log.info(s"sourcePath -> $sourcePath")
- val workingPath = parser.get("workingPath")
- log.info(s"workingPath -> $workingPath")
-
- log.info("Getting max pubmedId where the links have been requested")
- val links:Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
- val lastPMIDRequested =links.map(l => l.id).select(max("value")).first.getLong(0)
-
- log.info("Retrieving PMID to request links")
- val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
- pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request")
-
- val pmidToReq:Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
-
- val total = pmidToReq.count()
-
- spark.createDataset(pmidToReq.rdd.repartition((total/MAX_ITEM_PER_PARTITION).toInt).map(pmid =>createEBILinks(pmid)).filter(l => l!= null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update")
-
- val updates:Dataset[EBILinkItem] =spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
-
- links.union(updates).groupByKey(_.id)
- .reduceGroups{(x,y) =>
- if (x == null || x.links ==null)
- y
- if (y ==null || y.links ==null)
- x
- if (x.links.length > y.links.length)
- x
- else
- y
- }.map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final")
- }
-}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala
index bf726cf595..79c75d6df7 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/pangaea/SparkGeneratePanagaeaDataset.scala
@@ -1,7 +1,6 @@
package eu.dnetlib.dhp.sx.graph.pangaea
import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.sx.graph.ebi.SparkEBILinksToOaf
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/baseline_to_oaf_params.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/baseline_to_oaf_params.json
deleted file mode 100644
index 38eb500940..0000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/baseline_to_oaf_params.json
+++ /dev/null
@@ -1,6 +0,0 @@
-[
- {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
- {"paramName":"i", "paramLongName":"isLookupUrl","paramDescription": "isLookupUrl", "paramRequired": true},
- {"paramName":"w", "paramLongName":"workingPath","paramDescription": "the path of the sequencial file to read", "paramRequired": true},
- {"paramName":"t", "paramLongName":"targetPath","paramDescription": "the oaf path ", "paramRequired": true}
-]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json
deleted file mode 100644
index 0ae19234a3..0000000000
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json
+++ /dev/null
@@ -1,5 +0,0 @@
-[
- {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
- {"paramName":"s", "paramLongName":"sourcePath","paramDescription": "the source Path", "paramRequired": true},
- {"paramName":"w", "paramLongName":"workingPath","paramDescription": "the working path ", "paramRequired": true}
-]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/workflow.xml
index 3f442c5c65..7612321c04 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/oozie_app/workflow.xml
@@ -25,7 +25,6 @@
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
-
yarn-cluster
@@ -43,6 +42,7 @@
--workingPath${workingPath}
--masteryarn
+ --hdfsServerUri${nameNode}
@@ -74,7 +74,7 @@
yarn-cluster
cluster
- Create Baselnie DataSet
+ Create Baseline DataSet
eu.dnetlib.dhp.sx.ebi.SparkAddLinkUpdates
dhp-graph-mapper-${projectVersion}.jar
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/workflow.xml
index 1b738caed3..cd3bb8c714 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/ebi/update/oozie_app/workflow.xml
@@ -1,59 +1,67 @@
-
-
-
- sourcePath
- the Working Path
-
-
- workingPath
- the Working Path
-
-
- sparkDriverMemory
- memory for driver process
-
-
- sparkExecutorMemory
- memory for individual executor
-
-
- sparkExecutorCores
- number of cores used by single executor
-
-
+
+
+
+ sourcePath
+ the Source Path
+
+
+ workingPath
+ the Working Path
+
+
+ sparkDriverMemory
+ memory for driver process
+
+
+ sparkExecutorMemory
+ memory for individual executor
+
+
+ sparkExecutorCores
+ number of cores used by single executor
+
+
-
+
-
- Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
-
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
-
-
- yarn-cluster
- cluster
- Incremental Download EBI Links
- eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks
- dhp-graph-mapper-${projectVersion}.jar
-
- --executor-memory=${sparkExecutorMemory}
- --executor-cores=${sparkExecutorCores}
- --driver-memory=${sparkDriverMemory}
- --conf spark.extraListeners=${spark2ExtraListeners}
- --conf spark.sql.shuffle.partitions=2000
- --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
- --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
- --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
-
- --sourcePath${sourcePath}
- --workingPath${workingPath}
- --masteryarn
-
-
-
-
-
-
-
\ No newline at end of file
+
+
+ yarn-cluster
+ cluster
+ Incremental Download EBI Links
+ eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks
+ dhp-graph-mapper-${projectVersion}.jar
+
+ --executor-memory=${sparkExecutorMemory}
+ --executor-cores=${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners=${spark2ExtraListeners}
+ --conf spark.sql.shuffle.partitions=2000
+ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ --sourcePath${sourcePath}
+ --workingPath${workingPath}
+ --masteryarn
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java
index 6229ad19b4..64935e79d0 100644
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java
@@ -89,6 +89,26 @@ public class IndexRecordTransformerTest {
testRecordTransformation(record);
}
+ @Test
+ public void testForEOSCFutureAirQualityCopernicus() throws IOException, TransformerException {
+ final String record = IOUtils
+ .toString(getClass().getResourceAsStream("eosc-future/air-quality-copernicus.xml"));
+ testRecordTransformation(record);
+ }
+
+ @Test
+ public void testForEOSCFutureB2SharePlotSw() throws IOException, TransformerException {
+ final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/b2share-plot-sw.xml"));
+ testRecordTransformation(record);
+ }
+
+ @Test
+ public void testForEOSCFutureB2SharePlotRelatedORP() throws IOException, TransformerException {
+ final String record = IOUtils
+ .toString(getClass().getResourceAsStream("eosc-future/b2share-plot-related-orp.xml"));
+ testRecordTransformation(record);
+ }
+
private void testRecordTransformation(final String record) throws IOException, TransformerException {
final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml"));
final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl"));
diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/air-quality-copernicus.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/air-quality-copernicus.xml
new file mode 100644
index 0000000000..43b256bbb3
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/air-quality-copernicus.xml
@@ -0,0 +1,114 @@
+
+
+
+ r37b0ad08687::a8df7db30ae0e4e0b875a098df7b652f
+ 2021-10-07T01:56:56Z
+ under curation
+
+
+
+
+
+
+ Using CAMS European air quality analysis from Copernicus
+ Atmosphere Monitoring with RELIANCE services
+
+ Simone Mantovani
+ 2021-10-07
+
+
+
+ This notebook shows how to discover and access the Copernicus Atmosphere Monitoring products available in the RELIANCE datacube resources.
+ The process is structured in 6 steps, including example of data analysis and visualization with the Python libraries installed in the Jupyter environment
+
+
+ EOSC Jupyter Notebook
+
+ RELIANCE
+
+ Copernicus
+
+ Air quality
+
+
+
+ Zenodo
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ oai:zenodo.org:5554786
+
+ oai:zenodo.org:5554786
+
+ 10.5281/zenodo.5554786
+
+
+
+ false
+ false
+ 0.9
+
+
+
+
+
+
+ corda__h2020::8771f523c34e38902d4921037d545ef8
+
+ REsearch LIfecycle mAnagemeNt for Earth Science Communities and CopErnicus users in EOSC
+ 101017501
+ RELIANCE
+
+
+ ec__________::EC::H2020
+ ec__________::EC::H2020::RIA
+
+
+
+
+
+
+
+
+
+
+
+ https://zenodo.org/record/5554786
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/b2share-plot-related-orp.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/b2share-plot-related-orp.xml
new file mode 100644
index 0000000000..3c2c6440fe
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/b2share-plot-related-orp.xml
@@ -0,0 +1,288 @@
+
+
+
+ doi_dedup___::44fd8a9b5b79adb0783ac245b21e3127
+ 2019-09-19T07:43:31+0000
+ 2019-09-19T07:43:31+0000
+
+
+
+
+
+
+
+ 10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
+ 10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
+ 6a93c069-a167-44cb-bfe8-74c275637347
+ 50|r3730f562f9e::9b434fedc00d568b8e00611a7fa19f41
+ 10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
+ ada23067-496a-494f-bd82-6ffe3cf4f0fb
+ 50|r3730f562f9e::b9cd774e8126b6902d56f9a4aa03e1dc
+ f3bd1041-422c-439d-8e68-c1d0711d130d
+ 50|r3730f562f9e::b847821a0ca5365b0d971dd89dea6bf1
+ 10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
+
+ 10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
+
+ 10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
+
+ HCG16 L-band VLA C+D array final data
+
+
+ Jones, Michael G.
+ 2019-01-01
+ These are the reduced final data associated with the paper Jones et al. 2019 submitted
+ to Astronomy & Astrophysics. They are used by a mybinder (https://gke.mybinder.org/)
+ executable environment to generate the final plots of that paper. The link for this environment
+ is https://mybinder.org/v2/gh/AMIGA-IAA/hcg-16/master. The raw VLA D and C array data of HCG 16
+ were collected by the Very Large Array (http://www.vla.nrao.edu/) in 1989 and 1999, under PI
+ projects of Barbara Williams. The project numbers are AW234 and AW500 respectively. The file
+ also includes a grz colour image and r-band image from DECaLS DR8
+ (http://legacysurvey.org/decamls/), a GBT HI spectrum published in Borthakur et al. 2010 (ApJ
+ 710, 385), an HI data cube from HIPASS (https://www.atnf.csiro.au/research/multibeam/release/),
+ and a source mask (and associated parameters file) for the HIPASS cube generated using SoFiA
+ (https://github.com/SoFiA-Admin/SoFiA-2).
+
+ 3.5.2.1.1 → Observational astronomy →
+ Radio astronomy
+
+ HI
+
+ VLA
+
+ HCG16
+
+ Various
+
+
+ 2019-01-01
+
+ https://b2share.eudat.eu
+
+
+
+ true
+ false
+ 0.8
+ dedup-similarity-result-decisiontree-v2
+
+
+
+
+ userclaim___::ee29372a239b79db3ac4c5debe44d6e6
+
+ Plot scripts for HCG-16 Project
+
+
+
+
+ 2019-01-01
+ HCG16 L-band VLA C+D
+ array final data
+
+
+ B2SHARE
+
+
+ 2019-01-01
+ 10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
+
+ HCG16 L-band VLA C+D array final data
+
+
+ https://b2share.eudat.eu
+
+
+ 2019-01-01
+ HCG16 L-band VLA C+D array final data
+
+
+ 10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
+
+ https://b2share.eudat.eu
+
+
+ 2019-01-01
+ HCG16 L-band VLA C+D
+ array final data
+
+
+ B2SHARE
+
+
+ 2019-01-01
+ HCG16 L-band VLA C+D array final data
+
+
+ https://b2share.eudat.eu
+ 10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
+
+
+
+ 2019-01-01
+ HCG16 L-band VLA C+D
+ array final data
+
+
+ B2SHARE
+
+
+
+
+
+ 2019-01-01
+
+ 10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
+
+
+
+ https://dx.doi.org/10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
+
+
+
+
+
+
+ 2019-01-01
+
+ 10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
+
+
+
+ https://dx.doi.org/10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
+
+
+
+
+
+
+ 2019-01-01
+
+
+ https://doi.org10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
+
+
+
+
+ http://dx.doi.org/https://doi.org/10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
+
+
+
+
+
+
+
+ 2019-01-01
+
+ 10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
+
+
+
+ https://dx.doi.org/10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
+
+
+
+
+
+
+ 2019-01-01
+
+
+ https://doi.org10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
+
+
+
+
+ http://dx.doi.org/https://doi.org/10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
+
+
+
+
+
+
+
+ 2019-01-01
+
+
+ https://doi.org10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
+
+
+
+
+ http://dx.doi.org/https://doi.org/10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/b2share-plot-sw.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/b2share-plot-sw.xml
new file mode 100644
index 0000000000..5f44f6b1fd
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/b2share-plot-sw.xml
@@ -0,0 +1,112 @@
+
+
+
+ userclaim___::ee29372a239b79db3ac4c5debe44d6e6
+ 2021-10-07T12:42:54Z
+
+
+
+
+
+
+ Plot scripts for HCG-16 Project
+
+ Jones, Michael G.
+ Jones, Michael G.
+ 2021-09-30
+
+
+ These are the notebooks to general the final data plots of the paper Jones et al. 2019
+ submitted to Astronomy & Astrophysics. They can be used in a notebooks environment (like
+ https://notebooks.egi.eu/) with the proper libraries installed. A mybinder
+ (https://mybinder.org/)
+ ready version can be started from https://mybinder.org/v2/gh/AMIGA-IAA/hcg-16/master. Data to
+ generate plots is also available from B2SHARE:
+ https://b2share.eudat.eu/records/a69a7b2dcc22449e8734552dde4d3906
+
+
+ EOSC Jupyter Notebook
+
+
+ B2SHARE
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ userclaim___::ee29372a239b79db3ac4c5debe44d6e6
+
+ 10.23728/b2share.adf6e2e942b04561a8640c449b48c14a
+
+
+
+ false
+ false
+ 0.9
+
+
+
+
+
+ doi_dedup___::44fd8a9b5b79adb0783ac245b21e3127
+ HCG16 L-band VLA C+D array final data
+ 2019-01-01
+ https://b2share.eudat.eu
+ 10.23728/b2share.ebcd2972c5fb44199f8b3fdf9f6413c6
+
+ 10.23728/b2share.a69a7b2dcc22449e8734552dde4d3906
+
+ 10.23728/b2share.7c8655b6f25348358b4e6fece7ab6016
+
+
+
+
+
+
+
+
+
+
+
+ 2021-09-30
+
+ http://dx.doi.org/10.23728/b2share.adf6e2e942b04561a8640c449b48c14a
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/data-transfer-pilot.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/data-transfer-pilot.xml
index 23dd6c6ed0..6d2ac7630c 100644
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/data-transfer-pilot.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/data-transfer-pilot.xml
@@ -1,26 +1,25 @@
- r37b0ad08687::dec0d8520e726f2adda9a51280ac7299
- 2021-09-22T08:53:16Z
- under curation
-
+ doi_dedup___::ab57f086011a9ae23d1165211dc6e04b
+ 2020-11-03T05:39:50+0000
+ 2020-11-03T05:39:50+0000
EGI-Foundation/data-transfer-pilot: Include libraries in environment.yml
- Giuseppe La Rocca
- Enol Fernández
- Andrea Manzi
-
+ Giuseppe La Rocca
+ Enol Fernández
+ Andrea Manzi
+ 2020-11-03
This notebook is used to demonstrate how a scientist from one of the PaNOSC RIs can use the resources provided by EGI to perform analysis on the data sets obtained during an expirement.
EOSC Jupyter Notebook
-
+ 2020-11-03
Zenodo
@@ -43,8 +42,8 @@
oai:zenodo.org:4218562
oai:zenodo.org:4218562
- 10.5281/zenodo.4218562
-
+ 10.5281/zenodo.4195418
+ 10.5281/zenodo.4218562
false
false
@@ -59,7 +58,7 @@
-
+ 2020-11-03
https://zenodo.org/record/4218562
diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/training-notebooks-seadatanet.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/training-notebooks-seadatanet.xml
index 9995b902f6..9ab9b98614 100644
--- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/training-notebooks-seadatanet.xml
+++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/eosc-future/training-notebooks-seadatanet.xml
@@ -1,10 +1,8 @@
- r37b0ad08687::eb430fb7438e1533ba95d6aa50a477eb
+ doi_dedup___::8539a8de8996e01350f0de8ca4899b7f
2021-09-22T08:53:13Z
- under curation
-
EGI-Foundation/training-notebooks-seadatanet: Version 0.4
Enol Fernández
-
+ 2019-12-04
- A sample notebook using SeaDataNet data to plot a map that shows surface temperature of Black Sea, Arctic Sea and Baltic Sea. The data is available at EGI DataHub with PID http://hdl.handle.net/21.T15999/qVk6JWQ (run at EGI Notebooks service for easy access to data).This release updates the PID for the data.
+ A sample notebook using SeaDataNet data to plot a map that shows surface temperature of Black Sea, Arctic Sea and Baltic Sea. The data is available at EGI DataHub with PID http://hdl.handle.net/21.T15999/3Byz9Cw (run at EGI Notebooks service for easy access to data). This release uses the correct path of the data share from the EGI DataHub.
EOSC Jupyter Notebook
@@ -43,6 +41,9 @@
oai:zenodo.org:3561323
10.5281/zenodo.3561323
+ 10.5281/zenodo.3443996
+ 10.5281/zenodo.3475539
+ 10.5281/zenodo.3475785
false
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh
index 92543b8b8c..55a308c50c 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/observatory-pre.sh
@@ -13,4 +13,4 @@ export SHADOW=$3
echo "Creating observatory database"
impala-shell -q "drop database if exists ${TARGET} cascade"
impala-shell -q "create database if not exists ${TARGET}"
-impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f -
\ No newline at end of file
+impala-shell -d ${SOURCE} -q "show tables" --delimited | grep -iv roar | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f -
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
index 020787039f..0ea4a5adc5 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql
@@ -233,4 +233,50 @@ on p.id= tmp.id;
create table indi_pub_has_abstract stored as parquet as
select distinct publication.id, coalesce(abstract, 1) has_abstract
-from publication;
\ No newline at end of file
+from publication;
+
+-- Flags every result: has_orcid = 1 when the result id has at least one row
+-- in result_orcid, 0 otherwise.
+create table indi_with_orcid stored as parquet as
+select distinct r.id, coalesce(has_orcid, 0) as has_orcid
+from result r
+left outer join (select id, 1 as has_orcid from result_orcid) tmp
+on r.id= tmp.id;
+
+-- Flags every id found in project_results: fundref = 1 when at least one of
+-- its rows has provenance 'Harvested', 0 otherwise. The inner DISTINCT keeps
+-- the left join from multiplying rows.
+create table indi_funded_result_with_fundref stored as parquet as
+select distinct r.id, coalesce(fundref, 0) as fundref
+from project_results r
+left outer join (select distinct id, 1 as fundref from project_results
+where provenance='Harvested') tmp
+on r.id= tmp.id;
+
+-- For each organization and result type, counts the distinct results shared
+-- with a different organization, grouped by that organization's (different)
+-- country; 'UNKNOWN' countries are ignored. Must be created only once.
+create table indi_result_org_country_collab stored as parquet as
+with tmp as
+(select o.id as id, o.country , ro.id as result,r.type from organization o
+join result_organization ro on o.id=ro.organization
+join result r on r.id=ro.id where o.country <> 'UNKNOWN')
+select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
+from tmp as o1
+join tmp as o2 on o1.result=o2.result
+where o1.id<>o2.id and o1.country<>o2.country
+group by o1.id, o1.type,o2.country;
+
+-- For each ordered pair of distinct organizations and result type, counts the
+-- distinct results linked to both. Only o1.id<>o2.id is required, so every
+-- pair is emitted in both directions (org1/org2 swapped).
+create table indi_result_org_collab stored as parquet as
+with tmp as
+(select o.id, ro.id as result,r.type from organization o
+join result_organization ro on o.id=ro.organization
+join result r on r.id=ro.id)
+select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
+from tmp as o1
+join tmp as o2 on o1.result=o2.result
+where o1.id<>o2.id
+group by o1.id, o2.id, o1.type;