promoting beta branch to master - October 2021 #149
|
@ -1,14 +1,12 @@
|
||||||
package eu.dnetlib.dhp.sx.graph.bio
|
package eu.dnetllib.dhp.sx.bio
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
|
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, Instance, KeyValue, Oaf, Relation, StructuredProperty}
|
import eu.dnetlib.dhp.schema.oaf._
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||||
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||||
|
import collection.JavaConverters._
|
||||||
import scala.collection.JavaConverters._
|
|
||||||
|
|
||||||
object BioDBToOAF {
|
object BioDBToOAF {
|
||||||
|
|
||||||
case class EBILinkItem(id: Long, links: String) {}
|
case class EBILinkItem(id: Long, links: String) {}
|
||||||
|
@ -231,7 +229,6 @@ object BioDBToOAF {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def generate_unresolved_id(pid: String, pidType: String): String = {
|
def generate_unresolved_id(pid: String, pidType: String): String = {
|
||||||
s"unresolved::$pid::$pidType"
|
s"unresolved::$pid::$pidType"
|
||||||
}
|
}
|
|
@ -1,8 +1,8 @@
|
||||||
package eu.dnetlib.dhp.sx.graph.bio
|
package eu.dnetllib.dhp.sx.bio
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
import BioDBToOAF.ScholixResolved
|
import eu.dnetllib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
|
@ -33,7 +33,6 @@ object SparkTransformBioDatabaseToOAF {
|
||||||
|
|
||||||
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
database.toUpperCase() match {
|
database.toUpperCase() match {
|
||||||
case "UNIPROT" =>
|
case "UNIPROT" =>
|
||||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).write.mode(SaveMode.Overwrite).save(targetPath)
|
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).write.mode(SaveMode.Overwrite).save(targetPath)
|
|
@ -1,10 +1,10 @@
|
||||||
package eu.dnetlib.dhp.sx.graph.ebi
|
package eu.dnetllib.dhp.sx.bio.ebi
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result
|
import eu.dnetlib.dhp.schema.oaf.Result
|
||||||
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
|
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||||
|
import eu.dnetllib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.hadoop.conf.Configuration
|
import org.apache.hadoop.conf.Configuration
|
||||||
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
|
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
|
||||||
|
@ -90,10 +90,6 @@ object SparkCreateBaselineDataFrame {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
|
def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
|
||||||
|
|
||||||
|
|
||||||
|
@ -153,7 +149,7 @@ object SparkCreateBaselineDataFrame {
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/baseline_to_oaf_params.json")))
|
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json")))
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
val isLookupUrl: String = parser.get("isLookupUrl")
|
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||||
log.info("isLookupUrl: {}", isLookupUrl)
|
log.info("isLookupUrl: {}", isLookupUrl)
|
||||||
|
@ -175,9 +171,9 @@ object SparkCreateBaselineDataFrame {
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master")).getOrCreate()
|
||||||
import spark.implicits._
|
|
||||||
|
|
||||||
val sc = spark.sparkContext
|
val sc = spark.sparkContext
|
||||||
|
import spark.implicits._
|
||||||
|
|
||||||
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
|
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
|
||||||
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
|
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
|
||||||
|
@ -186,7 +182,7 @@ object SparkCreateBaselineDataFrame {
|
||||||
|
|
||||||
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
||||||
|
|
||||||
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline",2000)
|
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline_ftp", 2000)
|
||||||
val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
|
val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
|
||||||
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
||||||
new PMParser(xml)
|
new PMParser(xml)
|
|
@ -1,8 +1,8 @@
|
||||||
package eu.dnetlib.dhp.sx.graph.ebi
|
package eu.dnetllib.dhp.sx.bio.ebi
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
|
import eu.dnetllib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
||||||
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
import eu.dnetllib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.http.client.config.RequestConfig
|
import org.apache.http.client.config.RequestConfig
|
||||||
import org.apache.http.client.methods.HttpGet
|
import org.apache.http.client.methods.HttpGet
|
||||||
|
@ -60,6 +60,7 @@ object SparkDownloadEBILinks {
|
||||||
requestPage(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
|
requestPage(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
val log: Logger = LoggerFactory.getLogger(getClass)
|
|
@ -1,15 +1,14 @@
|
||||||
package eu.dnetlib.dhp.sx.graph.ebi
|
package eu.dnetllib.dhp.sx.bio.ebi
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
import eu.dnetlib.dhp.sx.graph.bio
|
import eu.dnetllib.dhp.sx.bio.BioDBToOAF
|
||||||
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
|
import eu.dnetllib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
||||||
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem
|
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.sql._
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
object SparkEBILinksToOaf {
|
object SparkEBILinksToOaf {
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
@ -24,12 +23,12 @@ object SparkEBILinksToOaf {
|
||||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master")).getOrCreate()
|
||||||
|
|
||||||
|
|
||||||
|
import spark.implicits._
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
log.info(s"sourcePath -> $sourcePath")
|
log.info(s"sourcePath -> $sourcePath")
|
||||||
val targetPath = parser.get("targetPath")
|
val targetPath = parser.get("targetPath")
|
||||||
log.info(s"targetPath -> $targetPath")
|
log.info(s"targetPath -> $targetPath")
|
||||||
|
|
||||||
import spark.implicits._
|
|
||||||
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||||
|
|
||||||
val ebLinks: Dataset[EBILinkItem] = spark.read.load(s"${sourcePath}_dataset").as[EBILinkItem].filter(l => l.links != null)
|
val ebLinks: Dataset[EBILinkItem] = spark.read.load(s"${sourcePath}_dataset").as[EBILinkItem].filter(l => l.links != null)
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.sx.graph.bio.pubmed;
|
package eu.dnetllib.dhp.sx.bio.pubmed;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.sx.graph.bio.pubmed;
|
package eu.dnetllib.dhp.sx.bio.pubmed;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.sx.graph.bio.pubmed;
|
package eu.dnetllib.dhp.sx.bio.pubmed;
|
||||||
|
|
||||||
public class PMGrant {
|
public class PMGrant {
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.sx.graph.bio.pubmed;
|
package eu.dnetllib.dhp.sx.bio.pubmed;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
package eu.dnetlib.dhp.sx.graph.bio.pubmed
|
package eu.dnetllib.dhp.sx.bio.pubmed
|
||||||
|
|
||||||
import scala.xml.MetaData
|
import scala.xml.MetaData
|
||||||
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
|
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.sx.graph.bio.pubmed;
|
package eu.dnetllib.dhp.sx.bio.pubmed;
|
||||||
|
|
||||||
public class PMSubject {
|
public class PMSubject {
|
||||||
private String value;
|
private String value;
|
|
@ -1,11 +1,12 @@
|
||||||
package eu.dnetlib.dhp.sx.graph.bio.pubmed
|
package eu.dnetllib.dhp.sx.bio.pubmed
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf._
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
|
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
|
||||||
|
import eu.dnetlib.dhp.schema.oaf._
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
import java.util.regex.Pattern
|
import java.util.regex.Pattern
|
||||||
import scala.collection.JavaConverters._
|
|
||||||
|
|
||||||
object PubMedToOaf {
|
object PubMedToOaf {
|
||||||
|
|
||||||
|
@ -71,7 +72,7 @@ object PubMedToOaf {
|
||||||
if (article.getPublicationTypes == null)
|
if (article.getPublicationTypes == null)
|
||||||
return null
|
return null
|
||||||
val i = new Instance
|
val i = new Instance
|
||||||
var pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
|
val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
|
||||||
if (pidList == null)
|
if (pidList == null)
|
||||||
return null
|
return null
|
||||||
|
|
||||||
|
@ -105,7 +106,7 @@ object PubMedToOaf {
|
||||||
if (alternateIdentifier != null)
|
if (alternateIdentifier != null)
|
||||||
i.setAlternateIdentifier(List(alternateIdentifier).asJava)
|
i.setAlternateIdentifier(List(alternateIdentifier).asJava)
|
||||||
result.setInstance(List(i).asJava)
|
result.setInstance(List(i).asJava)
|
||||||
i.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection breakOut)
|
i.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut)
|
||||||
val urlLists: List[String] = pidList
|
val urlLists: List[String] = pidList
|
||||||
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
|
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
|
||||||
.filter(t => t._1.nonEmpty)
|
.filter(t => t._1.nonEmpty)
|
||||||
|
@ -136,7 +137,7 @@ object PubMedToOaf {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection breakOut)
|
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection.breakOut)
|
||||||
if (subjects != null)
|
if (subjects != null)
|
||||||
result.setSubject(subjects.asJava)
|
result.setSubject(subjects.asJava)
|
||||||
|
|
||||||
|
@ -148,7 +149,7 @@ object PubMedToOaf {
|
||||||
author.setFullname(a.getFullName)
|
author.setFullname(a.getFullName)
|
||||||
author.setRank(index + 1)
|
author.setRank(index + 1)
|
||||||
author
|
author
|
||||||
}(collection breakOut)
|
}(collection.breakOut)
|
||||||
|
|
||||||
|
|
||||||
if (authors != null && authors.nonEmpty)
|
if (authors != null && authors.nonEmpty)
|
|
@ -1,13 +1,9 @@
|
||||||
<workflow-app name="Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="Download_Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>baselineWorkingPath</name>
|
<name>baselineWorkingPath</name>
|
||||||
<description>the Baseline Working Path</description>
|
<description>the Baseline Working Path</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
|
||||||
<name>targetPath</name>
|
|
||||||
<description>the Target Path</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
<property>
|
||||||
<name>isLookupUrl</name>
|
<name>isLookupUrl</name>
|
||||||
<description>The IS lookUp service endopoint</description>
|
<description>The IS lookUp service endopoint</description>
|
||||||
|
@ -24,8 +20,8 @@
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Convert Baseline to Dataset</name>
|
<name>Convert Baseline to OAF Dataset</name>
|
||||||
<class>eu.dnetlib.dhp.sx.graph.ebi.SparkCreateBaselineDataFrame</class>
|
<class>eu.dnetllib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame</class>
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
@ -38,9 +34,10 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--workingPath</arg><arg>${baselineWorkingPath}</arg>
|
<arg>--workingPath</arg><arg>${baselineWorkingPath}</arg>
|
||||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
<arg>--targetPath</arg><arg>${baselineWorkingPath}/transformed</arg>
|
||||||
<arg>--master</arg><arg>yarn</arg>
|
<arg>--master</arg><arg>yarn</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||||
|
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
|
@ -1,13 +1,10 @@
|
||||||
package eu.dnetlib.dhp.sx.graph.bio.pubmed
|
package eu.dnetllib.dhp.sx.bio
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
|
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.{CleaningFunctions, OafMapperUtils, PidType}
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
|
||||||
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved
|
import eu.dnetllib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||||
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF
|
import eu.dnetllib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PubMedToOaf}
|
||||||
import eu.dnetlib.dhp.sx.graph.bio.pubmed.PubMedToOaf.dataInfo
|
|
||||||
import eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks
|
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
@ -55,7 +52,7 @@ class BioScholixTest extends AbstractVocabularyTest{
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testEBIData() = {
|
def testEBIData() = {
|
||||||
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("pubmed.xml")).mkString
|
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")).mkString
|
||||||
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
|
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
|
||||||
new PMParser(xml).foreach(s =>println(mapper.writeValueAsString(s)))
|
new PMParser(xml).foreach(s =>println(mapper.writeValueAsString(s)))
|
||||||
}
|
}
|
||||||
|
@ -65,7 +62,7 @@ class BioScholixTest extends AbstractVocabularyTest{
|
||||||
def testPubmedToOaf(): Unit = {
|
def testPubmedToOaf(): Unit = {
|
||||||
assertNotNull(vocabularies)
|
assertNotNull(vocabularies)
|
||||||
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
|
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
|
||||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("pubmed_dump")).mkString
|
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")).mkString
|
||||||
val r:List[Oaf] = records.lines.toList.map(s=>mapper.readValue(s, classOf[PMArticle])).map(a => PubMedToOaf.convert(a, vocabularies))
|
val r:List[Oaf] = records.lines.toList.map(s=>mapper.readValue(s, classOf[PMArticle])).map(a => PubMedToOaf.convert(a, vocabularies))
|
||||||
assertEquals(10, r.size)
|
assertEquals(10, r.size)
|
||||||
assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p)))
|
assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p)))
|
|
@ -208,7 +208,7 @@ object SparkGenerateDoiBoost {
|
||||||
(r.getTarget,r)
|
(r.getTarget,r)
|
||||||
else
|
else
|
||||||
("resolved", r)
|
("resolved", r)
|
||||||
})
|
})(Encoders.tuple(Encoders.STRING, mapEncoderRel))
|
||||||
|
|
||||||
val openaireOrganization:Dataset[(String,String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x,y) => if (x != null) x else y ).map(_._2)
|
val openaireOrganization:Dataset[(String,String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x,y) => if (x != null) x else y ).map(_._2)
|
||||||
|
|
||||||
|
@ -222,7 +222,7 @@ object SparkGenerateDoiBoost {
|
||||||
else
|
else
|
||||||
currentRels.setTarget(currentOrgs._1)
|
currentRels.setTarget(currentOrgs._1)
|
||||||
currentRels
|
currentRels
|
||||||
}.write.save(s"$workingDirPath/doiBoostPublicationAffiliation")
|
}.filter(r=> !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
|
||||||
|
|
||||||
magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).map( item => {
|
magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).map( item => {
|
||||||
val affiliation = item._2
|
val affiliation = item._2
|
||||||
|
|
|
@ -84,7 +84,8 @@ public class PropagationConstant {
|
||||||
return di;
|
return di;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Qualifier getQualifier(String inference_class_id, String inference_class_name, String qualifierSchema) {
|
public static Qualifier getQualifier(String inference_class_id, String inference_class_name,
|
||||||
|
String qualifierSchema) {
|
||||||
Qualifier pa = new Qualifier();
|
Qualifier pa = new Qualifier();
|
||||||
pa.setClassid(inference_class_id);
|
pa.setClassid(inference_class_id);
|
||||||
pa.setClassname(inference_class_name);
|
pa.setClassname(inference_class_name);
|
||||||
|
@ -108,7 +109,11 @@ public class PropagationConstant {
|
||||||
r.setRelClass(rel_class);
|
r.setRelClass(rel_class);
|
||||||
r.setRelType(rel_type);
|
r.setRelType(rel_type);
|
||||||
r.setSubRelType(subrel_type);
|
r.setSubRelType(subrel_type);
|
||||||
r.setDataInfo(getDataInfo(inference_provenance, inference_class_id, inference_class_name, ModelConstants.DNET_PROVENANCE_ACTIONS));
|
r
|
||||||
|
.setDataInfo(
|
||||||
|
getDataInfo(
|
||||||
|
inference_provenance, inference_class_id, inference_class_name,
|
||||||
|
ModelConstants.DNET_PROVENANCE_ACTIONS));
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -173,7 +173,10 @@ public class SparkOrcidToResultFromSemRelJob {
|
||||||
if (toaddpid) {
|
if (toaddpid) {
|
||||||
StructuredProperty p = new StructuredProperty();
|
StructuredProperty p = new StructuredProperty();
|
||||||
p.setValue(autoritative_author.getOrcid());
|
p.setValue(autoritative_author.getOrcid());
|
||||||
p.setQualifier(getQualifier(ModelConstants.ORCID_PENDING, ModelConstants.ORCID_CLASSNAME, ModelConstants.DNET_PID_TYPES));
|
p
|
||||||
|
.setQualifier(
|
||||||
|
getQualifier(
|
||||||
|
ModelConstants.ORCID_PENDING, ModelConstants.ORCID_CLASSNAME, ModelConstants.DNET_PID_TYPES));
|
||||||
p
|
p
|
||||||
.setDataInfo(
|
.setDataInfo(
|
||||||
getDataInfo(
|
getDataInfo(
|
||||||
|
|
|
@ -10,7 +10,6 @@ import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
@ -22,6 +21,7 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
|
@ -7,7 +7,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
@ -20,6 +19,7 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
|
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
package eu.dnetlib.dhp.sx.graph.pangaea
|
package eu.dnetlib.dhp.sx.graph.pangaea
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.sx.graph.ebi.SparkEBILinksToOaf
|
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
import org.apache.spark.{SparkConf, SparkContext}
|
import org.apache.spark.{SparkConf, SparkContext}
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
|
|
|
@ -84,13 +84,15 @@ public class IndexRecordTransformerTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testForEOSCFutureTraining() throws IOException, TransformerException {
|
public void testForEOSCFutureTraining() throws IOException, TransformerException {
|
||||||
final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/training-notebooks-seadatanet.xml"));
|
final String record = IOUtils
|
||||||
|
.toString(getClass().getResourceAsStream("eosc-future/training-notebooks-seadatanet.xml"));
|
||||||
testRecordTransformation(record);
|
testRecordTransformation(record);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testForEOSCFutureAirQualityCopernicus() throws IOException, TransformerException {
|
public void testForEOSCFutureAirQualityCopernicus() throws IOException, TransformerException {
|
||||||
final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/air-quality-copernicus.xml"));
|
final String record = IOUtils
|
||||||
|
.toString(getClass().getResourceAsStream("eosc-future/air-quality-copernicus.xml"));
|
||||||
testRecordTransformation(record);
|
testRecordTransformation(record);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -102,12 +104,11 @@ public class IndexRecordTransformerTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testForEOSCFutureB2SharePlotRelatedORP() throws IOException, TransformerException {
|
public void testForEOSCFutureB2SharePlotRelatedORP() throws IOException, TransformerException {
|
||||||
final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/b2share-plot-related-orp.xml"));
|
final String record = IOUtils
|
||||||
|
.toString(getClass().getResourceAsStream("eosc-future/b2share-plot-related-orp.xml"));
|
||||||
testRecordTransformation(record);
|
testRecordTransformation(record);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private void testRecordTransformation(final String record) throws IOException, TransformerException {
|
private void testRecordTransformation(final String record) throws IOException, TransformerException {
|
||||||
final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml"));
|
final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml"));
|
||||||
final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl"));
|
final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl"));
|
||||||
|
|
Loading…
Reference in New Issue