added generation of EBI Dataset

This commit is contained in:
Sandro La Bruzzo 2020-07-10 14:44:50 +02:00
parent 18b9330312
commit a7d3977481
21 changed files with 985 additions and 107 deletions

View File

@ -14,6 +14,37 @@
<description>This module contains common schema classes meant to be used across the dnet-hadoop submodules</description> <description>This module contains common schema classes meant to be used across the dnet-hadoop submodules</description>
<build>
<plugins>
<!-- Compiles this module's Scala code with scala-maven-plugin.
     Main sources are added and compiled in the `initialize` phase; the execution id
     "scala-compile-first" suggests the goal is to have Scala classes available before
     the Java compilation (presumably for mixed Java/Scala sources; confirm).
     Test sources are compiled in `process-test-resources`.
     The Scala version comes from the `scala.version` property, defined outside this hunk. -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies> <dependencies>
<dependency> <dependency>

View File

@ -0,0 +1,90 @@
package eu.dnetlib.dhp.schema.scholexplorer
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Field, KeyValue, Qualifier, StructuredProperty}
/** Factory helpers for the OAF schema beans (`KeyValue`, `DataInfo`, `Qualifier`,
  * `Field`, `StructuredProperty`) so callers do not have to repeat the setter boilerplate.
  */
object OafUtils {

  /** Builds a `KeyValue` holding `key` and `value`, tagged with a data-info whose trust is "0.9". */
  def generateKeyValue(key: String, value: String): KeyValue = {
    val kv: KeyValue = new KeyValue()
    kv.setKey(key)
    kv.setValue(value)
    kv.setDataInfo(generateDataInfo("0.9"))
    kv
  }

  /** Builds a `DataInfo` that is neither inferred nor deleted-by-inference, with the
    * provenance action `sysimport:actionset` in scheme `dnet:provenanceActions`.
    *
    * @param trust      trust value stored as-is (default "0.9")
    * @param invisibile value for the `invisible` flag (default `false`); the parameter name
    *                   keeps its historical spelling so named-argument callers still compile
    */
  def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)
    // Honour the caller's flag: it used to be ignored and hard-coded to false.
    di.setInvisible(invisibile)
    di.setTrust(trust)
    di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
    di
  }

  /** Builds a `Qualifier` whose class id/name are both `cls` and whose scheme id/name are both `sch`. */
  def createQualifier(cls: String, sch: String): Qualifier = {
    createQualifier(cls, cls, sch, sch)
  }

  /** Builds a `Qualifier` from its four explicit components. */
  def createQualifier(classId: String, className: String, schemeId: String, schemeName: String): Qualifier = {
    val q: Qualifier = new Qualifier
    q.setClassid(classId)
    q.setClassname(className)
    q.setSchemeid(schemeId)
    q.setSchemename(schemeName)
    q
  }

  /** Wraps `value` into a `Field[T]`; no data-info is attached. */
  def asField[T](value: T): Field[T] = {
    val tmp = new Field[T]
    tmp.setValue(value)
    tmp
  }

  /** Builds a `StructuredProperty` with a fully specified qualifier and no data-info. */
  def createSP(value: String, classId: String, className: String, schemeId: String, schemeName: String): StructuredProperty = {
    val sp = new StructuredProperty
    sp.setQualifier(createQualifier(classId, className, schemeId, schemeName))
    sp.setValue(value)
    sp
  }

  /** Same as the five-argument overload, additionally attaching `dataInfo`. */
  def createSP(value: String, classId: String, className: String, schemeId: String, schemeName: String, dataInfo: DataInfo): StructuredProperty = {
    val sp = createSP(value, classId, className, schemeId, schemeName)
    sp.setDataInfo(dataInfo)
    sp
  }

  /** Builds a `StructuredProperty` whose qualifier uses `classId` as class id and name,
    * and `schemeId` as scheme id and name; no data-info is attached.
    */
  def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
    val sp = new StructuredProperty
    sp.setQualifier(createQualifier(classId, schemeId))
    sp.setValue(value)
    sp
  }

  /** Same as the three-argument overload, additionally attaching `dataInfo`. */
  def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
    val sp = createSP(value, classId, schemeId)
    sp.setDataInfo(dataInfo)
    sp
  }
}

View File

@ -9,6 +9,37 @@
<artifactId>dhp-graph-mapper</artifactId> <artifactId>dhp-graph-mapper</artifactId>
<build>
<plugins>
<!-- Compiles this module's Scala code with scala-maven-plugin.
     Main sources are added and compiled in the `initialize` phase; the execution id
     "scala-compile-first" suggests the goal is to have Scala classes available before
     the Java compilation (presumably for mixed Java/Scala sources; confirm).
     Test sources are compiled in `process-test-resources`.
     The Scala version comes from the `scala.version` property, defined outside this hunk. -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies> <dependencies>
<dependency> <dependency>
@ -61,6 +92,13 @@
<groupId>org.postgresql</groupId> <groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId> <artifactId>postgresql</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.json4s</groupId>
<artifactId>json4s-jackson_2.11</artifactId>
<version>3.5.3</version>
</dependency>
</dependencies> </dependencies>

View File

@ -0,0 +1,138 @@
package eu.dnetlib.dhp.sx.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Instance, KeyValue, Oaf}
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils.createQualifier
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIRelation, OafUtils, ProvenaceInfo}
import eu.dnetlib.dhp.utils.DHPUtils
import eu.dnetlib.scholexplorer.relation.RelationMapper
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._
/** Spark job that turns the Europe PMC link records stored under
  * `<workingPath>/baseline_links_updates` (pairs of PMID and link JSON) into OAF objects:
  * for every link a relation, its inverse, and the target dataset.
  */
object SparkAddLinkUpdates {

  val relationMapper = RelationMapper.load

  /** Flat view of one `Link` entry of the Europe PMC JSON. */
  case class EBILinks(relation: String, pubdate: String, tpid: String, tpidType: String, turl: String, title: String, publisher: String)

  /** The `collectedfrom` entry used for everything produced by this job (Europe PMC). */
  def generatePubmedDLICollectedFrom(): KeyValue = {
    OafUtils.generateKeyValue("dli_________::europe_pmc__", "Europe PMC")
  }

  /** Builds one direction of a link between `source` and `target` with semantic `relType`. */
  private def createRelation(source: String, target: String, relType: String): DLIRelation = {
    val rel = new DLIRelation
    rel.setSource(source)
    rel.setTarget(target)
    rel.setRelClass("datacite")
    rel.setRelType(relType)
    rel.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
    rel
  }

  /** Builds the dataset record describing the target of link `l`, identified by `targetDnetId`. */
  private def createTargetDataset(l: EBILinks, targetDnetId: String): DLIDataset = {
    val d = new DLIDataset
    d.setId(targetDnetId)
    d.setDataInfo(OafUtils.generateDataInfo())
    d.setPid(List(OafUtils.createSP(l.tpid.toLowerCase.trim, l.tpidType.toLowerCase.trim, "dnet:pid_types")).asJava)
    d.setCompletionStatus("complete")

    val pi = new ProvenaceInfo
    pi.setId("dli_________::europe_pmc__")
    pi.setName( "Europe PMC")
    pi.setCompletionStatus("complete")
    pi.setCollectionMode("collected")
    d.setDlicollectedfrom(List(pi).asJava)
    d.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
    d.setPublisher(OafUtils.asField(l.publisher))
    d.setTitle(List(OafUtils.createSP(l.title, "main title", "dnet:dataCite_title")).asJava)
    d.setDateofacceptance(OafUtils.asField(l.pubdate))

    val i = new Instance
    i.setCollectedfrom(generatePubmedDLICollectedFrom())
    i.setDateofacceptance(d.getDateofacceptance)
    i.setUrl(List(l.turl).asJava)
    i.setInstancetype(createQualifier("0021", "Dataset", "dnet:publication_resource", "dnet:publication_resource"))
    d.setInstance(List(i).asJava)
    d
  }

  /** Converts the links of one publication into OAF objects.
    *
    * @param input pair of (PMID, JSON document containing `Category` / `Link` entries)
    * @return for each link that carries all the extracted fields and whose relationship
    *         name is known to `relationMapper`: the relation, the inverse relation and
    *         the target dataset, in that order
    */
  def ebiLinksToOaf(input: (String, String)): List[Oaf] = {
    val pmid: String = input._1
    val input_json: String = input._2
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input_json)

    // A link is kept only when every field below is present with the expected JSON type.
    val targets: List[EBILinks] = for {
      JObject(link) <- json \\ "Category" \\ "Link"
      JField("PublicationDate", JString(pubdate)) <- link
      JField("RelationshipType", JObject(relationshipType)) <- link
      JField("Name", JString(relname)) <- relationshipType
      JField("Target", JObject(target)) <- link
      JField("Identifier", JObject(identifier)) <- target
      JField("ID", JString(tpid)) <- identifier
      JField("IDScheme", JString(tpidtype)) <- identifier
      JField("IDURL", JString(turl)) <- identifier
      JField("Title", JString(title)) <- target
      JField("Publisher", JObject(pub)) <- target
      JField("Name", JString(publisher)) <- pub
    } yield EBILinks(relname, pubdate, tpid, tpidtype, turl, title, publisher)

    val dnetPublicationId = s"50|${DHPUtils.md5(s"$pmid::pmid")}"

    targets.flatMap { l =>
      // Option(...) guards against a relationship name the mapper does not know: such a
      // link is skipped instead of failing the whole job on a null dereference.
      Option(relationMapper.get(l.relation.toLowerCase)) match {
        case Some(relInfo) =>
          val targetDnetId = s"50|${DHPUtils.md5(s"${l.tpid.toLowerCase.trim}::${l.tpidType.toLowerCase.trim}")}"
          val relation = createRelation(dnetPublicationId, targetDnetId, relInfo.getOriginal)
          val inverseRelation = createRelation(targetDnetId, dnetPublicationId, relInfo.getInverse)
          List[Oaf](relation, inverseRelation, createTargetDataset(l, targetDnetId))
        case None => List.empty[Oaf]
      }
    }
  }

  /** Entry point. Reads `<workingPath>/baseline_links_updates`, writes all generated OAF
    * objects to `baseline_links_updates_oaf`, then splits them by type into
    * `baseline_links_updates_relation` and `baseline_links_updates_dataset`.
    * Arguments are described by the `ebi_to_df_params.json` resource.
    */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        // Use this job's own name (it used to be reported as SparkCreateEBIDataFrame).
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    val workingPath = parser.get("workingPath")

    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
    implicit val relEncoder: Encoder[DLIRelation] = Encoders.kryo(classOf[DLIRelation])
    implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset])

    val ds: Dataset[(String, String)] = spark.read.load(s"$workingPath/baseline_links_updates").as[(String, String)](Encoders.tuple(Encoders.STRING, Encoders.STRING))

    ds.flatMap(l => ebiLinksToOaf(l)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_oaf")

    // The materialised output is read back so the two type-specific outputs do not
    // recompute the JSON conversion.
    val oDataset: Dataset[Oaf] = spark.read.load(s"$workingPath/baseline_links_updates_oaf").as[Oaf]

    oDataset.filter(p => p.isInstanceOf[DLIRelation]).map(p => p.asInstanceOf[DLIRelation]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_relation")
    oDataset.filter(p => p.isInstanceOf[DLIDataset]).map(p => p.asInstanceOf[DLIDataset]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_dataset")
  }
}

View File

@ -0,0 +1,49 @@
package eu.dnetlib.dhp.sx.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal, PMParser}
import scala.io.Source
import scala.xml.pull.XMLEventReader
/** Spark job that parses the PubMed XML files found under `<workingPath>/baseline`
  * into `PMArticle` records and stores them in `<workingPath>/baseline_dataset`.
  */
object SparkCreateBaselineDataFrame {

  /** Entry point; arguments are described by the `ebi_to_df_params.json` resource. */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        // Use this job's own name (it used to be reported as SparkCreateEBIDataFrame).
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    val sc = spark.sparkContext
    val workingPath = parser.get("workingPath")

    // Only the article encoder is needed: nested journal and authors travel inside the
    // Kryo-serialised article.
    implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])

    // One (path, whole file content) pair per input file.
    val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)

    val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
      // Feed the already-decoded text straight to the XML reader. Going through
      // getBytes()/fromBytes() re-encoded it with the JVM default charset, which loses
      // characters whenever that charset is not Unicode-capable.
      val xml = new XMLEventReader(Source.fromString(i._2))
      new PMParser(xml)
    }))

    ds.write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
  }
}

View File

@ -51,7 +51,7 @@ object SparkCreateEBIDataFrame {
spark.createDataset(oafPubsRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/oaf") spark.createDataset(oafPubsRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/oaf")
logger.info("Extract Publication and relation from dataset_xml") logger.info("Extract Publication and relation from dataset_xml")
val oafDatsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/_dataset_xml").map(s => val oafDatsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/dataset_xml").map(s =>
{ {
new ObjectMapper().readValue(s, classOf[String]) new ObjectMapper().readValue(s, classOf[String])
}).flatMap(s => { }).flatMap(s => {
@ -79,5 +79,9 @@ object SparkCreateEBIDataFrame {
.agg(EBIAggregator.getRelationAggregator().toColumn) .agg(EBIAggregator.getRelationAggregator().toColumn)
.map(p => p._2) .map(p => p._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/relation") .write.mode(SaveMode.Overwrite).save(s"$workingPath/relation")
relations.map(r => (r.getSource, r.getTarget))(Encoders.tuple(Encoders.STRING,Encoders.STRING))
} }
} }

View File

@ -0,0 +1,64 @@
package eu.dnetlib.dhp.sx.ebi.model;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
/**
 * Serializable bean holding the fields extracted from one PubMed article record.
 */
public class PMArticle implements Serializable {

	/** PubMed identifier of the article. */
	private String pmid;
	/** Date of the article, stored as text. */
	private String date;
	/** Journal the article belongs to. */
	private PMJournal journal;
	/** Article title. */
	private String title;
	/** Article description (abstract text). */
	private String description;
	/** Authors of the article; starts as an empty, mutable list. */
	private List<PMAuthor> authors = new ArrayList<>();

	/** @return the PubMed identifier */
	public String getPmid() {
		return this.pmid;
	}

	/** @param pmid the PubMed identifier to store */
	public void setPmid(final String pmid) {
		this.pmid = pmid;
	}

	/** @return the article date */
	public String getDate() {
		return this.date;
	}

	/** @param date the article date to store */
	public void setDate(final String date) {
		this.date = date;
	}

	/** @return the journal of the article */
	public PMJournal getJournal() {
		return this.journal;
	}

	/** @param journal the journal to store */
	public void setJournal(final PMJournal journal) {
		this.journal = journal;
	}

	/** @return the article title */
	public String getTitle() {
		return this.title;
	}

	/** @param title the article title to store */
	public void setTitle(final String title) {
		this.title = title;
	}

	/** @return the article description */
	public String getDescription() {
		return this.description;
	}

	/** @param description the article description to store */
	public void setDescription(final String description) {
		this.description = description;
	}

	/** @return the live list of authors (not a copy) */
	public List<PMAuthor> getAuthors() {
		return this.authors;
	}

	/** @param authors the list of authors to store (kept by reference) */
	public void setAuthors(final List<PMAuthor> authors) {
		this.authors = authors;
	}
}

View File

@ -0,0 +1,31 @@
package eu.dnetlib.dhp.sx.ebi.model;
import java.io.Serializable;
/**
 * Serializable bean holding the name parts of one PubMed author.
 */
public class PMAuthor implements Serializable {

	private String lastName;
	private String foreName;

	/** @return the author's last name, or {@code null} when not set */
	public String getLastName() {
		return lastName;
	}

	/** @param lastName the last name to store */
	public void setLastName(String lastName) {
		this.lastName = lastName;
	}

	/** @return the author's fore name, or {@code null} when not set */
	public String getForeName() {
		return foreName;
	}

	/** @param foreName the fore name to store */
	public void setForeName(String foreName) {
		this.foreName = foreName;
	}

	/**
	 * Renders the name as {@code "<foreName>, <lastName>"}.
	 * A missing part is rendered as an empty string, so the literal text "null"
	 * never ends up in the full name.
	 *
	 * @return the formatted full name, never {@code null}
	 */
	public String getFullName() {
		return String
			.format(
				"%s, %s",
				this.foreName != null ? this.foreName : "",
				this.lastName != null ? this.lastName : "");
	}
}

View File

@ -0,0 +1,53 @@
package eu.dnetlib.dhp.sx.ebi.model;
import java.io.Serializable;
/**
 * Serializable bean holding the journal information attached to a PubMed article.
 * All values are kept as plain text exactly as they were set.
 */
public class PMJournal implements Serializable {

	/** ISSN of the journal. */
	private String issn;
	/** Volume the article appeared in. */
	private String volume;
	/** Issue the article appeared in. */
	private String issue;
	/** Publication date, stored as text. */
	private String date;
	/** Journal title. */
	private String title;

	/** @return the ISSN */
	public String getIssn() {
		return this.issn;
	}

	/** @param issn the ISSN to store */
	public void setIssn(final String issn) {
		this.issn = issn;
	}

	/** @return the volume */
	public String getVolume() {
		return this.volume;
	}

	/** @param volume the volume to store */
	public void setVolume(final String volume) {
		this.volume = volume;
	}

	/** @return the issue */
	public String getIssue() {
		return this.issue;
	}

	/** @param issue the issue to store */
	public void setIssue(final String issue) {
		this.issue = issue;
	}

	/** @return the publication date */
	public String getDate() {
		return this.date;
	}

	/** @param date the publication date to store */
	public void setDate(final String date) {
		this.date = date;
	}

	/** @return the journal title */
	public String getTitle() {
		return this.title;
	}

	/** @param title the journal title to store */
	public void setTitle(final String title) {
		this.title = title;
	}
}

View File

@ -0,0 +1,92 @@
package eu.dnetlib.dhp.sx.ebi.model
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
/** Iterator over the `PubmedArticle` elements of a PubMed XML stream.
  *
  * One article is always parsed ahead: `currentArticle` holds the article that the next
  * call to `next()` will return, or `null` once the stream is exhausted.
  *
  * @param xml pull reader positioned on the XML document to parse
  */
class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] {

  var currentArticle: PMArticle = generateNextArticle()

  override def hasNext: Boolean = currentArticle != null

  /** Returns the look-ahead article (`null` when exhausted) and parses the following one. */
  override def next(): PMArticle = {
    val tmp = currentArticle
    currentArticle = generateNextArticle()
    tmp
  }

  /** Consumes events up to and including the end of the next `PubmedArticle` element.
    *
    * Parsing state is local to the call, so events found outside a `PubmedArticle`
    * (or before the next one starts) can no longer alter an article that was already
    * produced.
    *
    * @return the parsed article, or `null` when the stream holds no further article
    */
  def generateNextArticle(): PMArticle = {
    var article: Option[PMArticle] = None
    var author: Option[PMAuthor] = None
    var journal: Option[PMJournal] = None
    var currNode: String = null
    var currentYear = "0"
    var currentMonth = "01"
    var currentDay = "01"
    var completed: PMArticle = null

    // Text of one element may arrive in several chunks: concatenate them.
    def append(previous: String, chunk: String): String =
      if (previous == null) chunk else previous + chunk

    while (completed == null && xml.hasNext) {
      xml.next match {
        case EvElemStart(_, label, _, _) =>
          currNode = label
          label match {
            case "PubmedArticle" => article = Some(new PMArticle)
            case "Author" => author = Some(new PMAuthor)
            case "Journal" => journal = Some(new PMJournal)
            case "DateCompleted" | "PubDate" =>
              // Start every recorded date from the defaults so parts missing here are
              // not inherited from a previously read date element.
              currentYear = "0"
              currentMonth = "01"
              currentDay = "01"
            case _ =>
          }
        case EvElemEnd(_, label) =>
          label match {
            case "PubmedArticle" => completed = article.orNull
            case "Author" =>
              for (a <- article; person <- author) a.getAuthors.add(person)
              // Close the author: name elements of other person-like entries must not
              // overwrite it.
              author = None
            case "Journal" =>
              for (a <- article; j <- journal) a.setJournal(j)
              journal = None
            case "DateCompleted" => article.foreach(_.setDate(s"$currentYear-$currentMonth-$currentDay"))
            case "PubDate" => journal.foreach(_.setDate(s"$currentYear-$currentMonth-$currentDay"))
            case _ =>
          }
        case EvText(text) =>
          val value = text.trim
          if (currNode != null && value.nonEmpty)
            currNode match {
              case "ArticleTitle" => article.foreach(a => a.setTitle(append(a.getTitle, value)))
              case "AbstractText" => article.foreach(a => a.setDescription(append(a.getDescription, value)))
              // Keep the first PMID only: later PMID elements inside the same article
              // identify other records and must not replace the article's own id.
              case "PMID" => article.filter(_.getPmid == null).foreach(_.setPmid(value))
              case "ISSN" => journal.foreach(_.setIssn(value))
              case "Year" => currentYear = value
              case "Month" => currentMonth = value
              case "Day" => currentDay = value
              case "Volume" => journal.foreach(_.setVolume(value))
              case "Issue" => journal.foreach(_.setIssue(value))
              case "LastName" => author.foreach(_.setLastName(value))
              case "ForeName" => author.foreach(_.setForeName(value))
              case "Title" => journal.foreach(j => j.setTitle(append(j.getTitle, value)))
              case _ =>
            }
        case _ =>
      }
    }
    completed
  }
}

View File

@ -150,8 +150,8 @@ public abstract class AbstractScholexplorerParser {
return uk; return uk;
} }
protected Qualifier generateQualifier(final String classId, final String className, final String schemeId,
protected Qualifier generateQualifier(final String classId, final String className, final String schemeId, final String schemeName) { final String schemeName) {
final Qualifier q = new Qualifier(); final Qualifier q = new Qualifier();
q.setClassid(classId); q.setClassid(classId);
q.setClassid(className); q.setClassid(className);
@ -159,8 +159,6 @@ public abstract class AbstractScholexplorerParser {
q.setSchemename(schemeName); q.setSchemename(schemeName);
return q; return q;
} }
protected void generateRelations( protected void generateRelations(

View File

@ -148,7 +148,6 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
inferPid(currentPid); inferPid(currentPid);
parsedObject.setPid(Collections.singletonList(currentPid)); parsedObject.setPid(Collections.singletonList(currentPid));
String resolvedURL = null; String resolvedURL = null;
switch (currentPid.getQualifier().getClassname().toLowerCase()) { switch (currentPid.getQualifier().getClassname().toLowerCase()) {
@ -180,10 +179,6 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
break; break;
} }
final String sourceId = generateId( final String sourceId = generateId(
currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset"); currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset");
parsedObject.setId(sourceId); parsedObject.setId(sourceId);
@ -286,7 +281,11 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
t -> { t -> {
final StructuredProperty st = new StructuredProperty(); final StructuredProperty st = new StructuredProperty();
st.setValue(t); st.setValue(t);
st.setQualifier(generateQualifier( "main title","main title", "dnet:dataCite_title","dnet:dataCite_title")); st
.setQualifier(
generateQualifier(
"main title", "main title", "dnet:dataCite_title",
"dnet:dataCite_title"));
return st; return st;
}) })
.collect(Collectors.toList())); .collect(Collectors.toList()));
@ -318,7 +317,6 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (StringUtils.isNotBlank(resolvedURL)) { if (StringUtils.isNotBlank(resolvedURL)) {
Instance i = new Instance(); Instance i = new Instance();
i.setCollectedfrom(parsedObject.getCollectedfrom().get(0)); i.setCollectedfrom(parsedObject.getCollectedfrom().get(0));

View File

@ -202,7 +202,11 @@ public class PublicationScholexplorerParser extends AbstractScholexplorerParser
t -> { t -> {
final StructuredProperty st = new StructuredProperty(); final StructuredProperty st = new StructuredProperty();
st.setValue(t); st.setValue(t);
st.setQualifier(generateQualifier( "main title","main title", "dnet:dataCite_title","dnet:dataCite_title")); st
.setQualifier(
generateQualifier(
"main title", "main title", "dnet:dataCite_title",
"dnet:dataCite_title"));
return st; return st;
}) })
.collect(Collectors.toList())); .collect(Collectors.toList()));

View File

@ -1,4 +1,7 @@
<configuration> <configuration>
<!-- OCEAN -->
<!--
<property> <property>
<name>jobTracker</name> <name>jobTracker</name>
<value>yarnRM</value> <value>yarnRM</value>
@ -7,14 +10,6 @@
<name>nameNode</name> <name>nameNode</name>
<value>hdfs://nameservice1</value> <value>hdfs://nameservice1</value>
</property> </property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property> <property>
<name>hive_metastore_uris</name> <name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value> <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
@ -23,6 +18,41 @@
<name>spark2YarnHistoryServerAddress</name> <name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value> <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property> </property>
-->
<!-- GARR -->
<property>
<name>jobTracker</name>
<value>yarn</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property> <property>
<name>spark2EventLogDir</name> <name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value> <value>/user/spark/spark2ApplicationHistory</value>

View File

@ -18,7 +18,7 @@
</property> </property>
</parameters> </parameters>
<start to="CreateEBIDataSet"/> <start to="GenerateUpdates"/>
<kill name="Kill"> <kill name="Kill">
@ -26,13 +26,59 @@
</kill> </kill>
<!-- Runs eu.dnetlib.dhp.sx.ebi.SparkCreateBaselineDataFrame on YARN against ${workingPath}. -->
<action name="GenerateBaselineDataset">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <name>Create Baseline DataSet</name>
        <class>eu.dnetlib.dhp.sx.ebi.SparkCreateBaselineDataFrame</class>
        <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
        <!-- executor-cores is given once: the former hard-coded "=1" conflicted with
             the ${sparkExecutorCores} setting that followed it -->
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --driver-memory=${sparkDriverMemory}
            --executor-cores=${sparkExecutorCores}
            ${sparkExtraOPT}
        </spark-opts>
        <arg>--workingPath</arg><arg>${workingPath}</arg>
        <arg>--master</arg><arg>yarn</arg>
    </spark>
    <ok to="End"/>
    <error to="Kill"/>
</action>
<!-- Runs eu.dnetlib.dhp.sx.ebi.SparkAddLinkUpdates on YARN against ${workingPath}. -->
<action name="GenerateUpdates">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn-cluster</master>
        <mode>cluster</mode>
        <!-- own job name: it used to be a copy of the baseline action's name -->
        <name>Add Link Updates</name>
        <class>eu.dnetlib.dhp.sx.ebi.SparkAddLinkUpdates</class>
        <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
        <!-- executor-cores is given once: the former hard-coded "=1" conflicted with
             the ${sparkExecutorCores} setting that followed it -->
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --driver-memory=${sparkDriverMemory}
            --executor-cores=${sparkExecutorCores}
            ${sparkExtraOPT}
        </spark-opts>
        <arg>--workingPath</arg><arg>${workingPath}</arg>
        <arg>--master</arg><arg>yarn</arg>
    </spark>
    <ok to="End"/>
    <error to="Kill"/>
</action>
<action name="CreateEBIDataSet"> <action name="CreateEBIDataSet">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Create EBI DataSet</name> <name>Create EBI DataSet</name>
<class>eu.dnetlib.dhp.sx.ebi.SparkCreateEBIDataFrame</class> <class>eu.dnetlib.dhp.sx.ebi.SparkCreateEBIDataFrame</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
@ -41,7 +87,7 @@
${sparkExtraOPT} ${sparkExtraOPT}
</spark-opts> </spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg> <arg>--master</arg><arg>yarn</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -0,0 +1,20 @@
package eu.dnetlib.dhp.sx.ebi
import org.junit.jupiter.api.Test
/** Manual smoke test for [[SparkAddLinkUpdates]]. */
class TestEBI {

  /** Runs the whole job locally.
    *
    * It reads from and writes to a developer-specific absolute directory, so it is
    * disabled by default: on any other machine or on CI the input does not exist and the
    * run fails. To execute it, point `-w` at a directory containing a
    * `baseline_links_updates` dataset and remove the annotation.
    */
  @Test
  @org.junit.jupiter.api.Disabled("needs a local working directory containing baseline_links_updates")
  def testEBIData(): Unit = {
    SparkAddLinkUpdates.main("-mt local[*] -w /home/sandro/Downloads".split(" "))
  }
}

View File

@ -1,22 +0,0 @@
package eu.dnetlib.dhp.sx.ebi
import org.junit.jupiter.api.Test
class TestEBITODS {
@Test
def testEBI():Unit = {
}
}

View File

@ -0,0 +1,55 @@
{
"Category": [
{
"Section": [
{
"Linklist": {
"Link": [
{
"LinkProvider": {
"Name": "Europe PMC"
},
"Target": {
"Publisher": {
"Name": "Altmetric"
},
"ImageURL": "https://api.altmetric.com/v1/donut/58578459_64.png",
"Identifier": {
"ID": "https://www.altmetric.com/details/58578459",
"IDScheme": "URL",
"IDURL": "https://www.altmetric.com/details/58578459"
},
"Type": {
"Name": "dataset"
},
"Title": "Optical clumped isotope thermometry of carbon dioxide"
},
"Source": {
"Identifier": {
"ID": "30886173",
"IDScheme": "PMID"
},
"Type": {
"Name": "literature"
}
},
"PublicationDate": "06-04-2019",
"RelationshipType": {
"Name": "IsReferencedBy"
},
"ObtainedBy": "ext_links"
}
]
},
"ObtainedBy": "ext_links",
"SectionLinkCount": 1,
"Tags": [
"altmetrics"
]
}
],
"CategoryLinkCount": 1,
"Name": "Altmetric"
}
]
}

View File

@ -0,0 +1,191 @@
{
"version": "6.3",
"hitCount": 4,
"request": {
"id": "28818901",
"source": "MED"
},
"dataLinkList": {
"Category": [
{
"Name": "Nucleotide Sequences",
"CategoryLinkCount": 3,
"Section": [
{
"ObtainedBy": "tm_accession",
"Tags": [
"supporting_data"
],
"SectionLinkCount": 1,
"Linklist": {
"Link": [
{
"ObtainedBy": "tm_accession",
"PublicationDate": "27-02-2020",
"LinkProvider": {
"Name": "Europe PMC"
},
"RelationshipType": {
"Name": "References"
},
"Source": {
"Type": {
"Name": "literature"
},
"Identifier": {
"ID": "28818901",
"IDScheme": "MED"
}
},
"Target": {
"Type": {
"Name": "dataset"
},
"Identifier": {
"ID": "AP008937",
"IDScheme": "ENA",
"IDURL": "http://identifiers.org/ena.embl/AP008937"
},
"Title": "AP008937",
"Publisher": {
"Name": "Europe PMC"
}
},
"Frequency": 1
}
]
}
},
{
"ObtainedBy": "submission",
"Tags": [
"related_data"
],
"SectionLinkCount": 2,
"CollectionURL": "http://www.ebi.ac.uk/ena/data/search?query=28818901",
"Linklist": {
"Link": [
{
"ObtainedBy": "submission",
"PublicationDate": "25-06-2018",
"LinkProvider": {
"Name": "Europe PMC"
},
"RelationshipType": {
"Name": "IsReferencedBy"
},
"Source": {
"Type": {
"Name": "literature"
},
"Identifier": {
"ID": "28818901",
"IDScheme": "PMID"
}
},
"Target": {
"Type": {
"Name": "dataset"
},
"Identifier": {
"ID": "NIWV01000000",
"IDScheme": "ENA",
"IDURL": "http://www.ebi.ac.uk/ena/data/view/NIWV01000000"
},
"Title": "Nucleotide sequences",
"Publisher": {
"Name": "ENA"
}
}
},
{
"ObtainedBy": "submission",
"PublicationDate": "25-06-2018",
"LinkProvider": {
"Name": "Europe PMC"
},
"RelationshipType": {
"Name": "IsReferencedBy"
},
"Source": {
"Type": {
"Name": "literature"
},
"Identifier": {
"ID": "28818901",
"IDScheme": "PMID"
}
},
"Target": {
"Type": {
"Name": "dataset"
},
"Identifier": {
"ID": "PRJNA390617",
"IDScheme": "ENA",
"IDURL": "http://www.ebi.ac.uk/ena/data/view/PRJNA390617"
},
"Title": "Lactobacillus fermentum strain:BFE 6620",
"Publisher": {
"Name": "ENA"
}
}
}
]
}
}
]
},
{
"Name": "BioStudies: supplemental material and supporting data",
"CategoryLinkCount": 1,
"Section": [
{
"ObtainedBy": "ext_links",
"Tags": [
"supporting_data"
],
"SectionLinkCount": 1,
"Linklist": {
"Link": [
{
"ObtainedBy": "ext_links",
"PublicationDate": "24-07-2018",
"LinkProvider": {
"Name": "Europe PMC"
},
"RelationshipType": {
"Name": "IsReferencedBy"
},
"Source": {
"Type": {
"Name": "literature"
},
"Identifier": {
"ID": "28818901",
"IDScheme": "PMID"
}
},
"Target": {
"Type": {
"Name": "dataset"
},
"Identifier": {
"ID": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC5604774?xr=true",
"IDScheme": "URL",
"IDURL": "http://www.ebi.ac.uk/biostudies/studies/S-EPMC5604774?xr=true"
},
"Title": "Draft Genome Sequence of Lactobacillus fermentum BFE 6620, a Potential Starter Culture for African Vegetable Foods, Isolated from Fermented Cassava.",
"Publisher": {
"Name": "BioStudies: supplemental material and supporting data"
}
}
}
]
}
}
]
}
]
}
}

View File

@ -5,11 +5,12 @@ import java.time.format.DateTimeFormatter
import eu.dnetlib.dhp.common.PacePerson import eu.dnetlib.dhp.common.PacePerson
import eu.dnetlib.dhp.schema.action.AtomicAction import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty} import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation}
import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.StringUtils
import org.codehaus.jackson.map.ObjectMapper import org.codehaus.jackson.map.ObjectMapper
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils._
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
@ -426,46 +427,6 @@ object DLIToOAF {
} }
def generateKeyValue(key: String, value: String): KeyValue = {
val kv: KeyValue = new KeyValue()
kv.setKey(key)
kv.setValue(value)
kv.setDataInfo(generateDataInfo("0.9"))
kv
}
def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
di
}
def createQualifier(cls: String, sch: String): Qualifier = {
createQualifier(cls, cls, sch, sch)
}
def createQualifier(classId: String, className: String, schemeId: String, schemeName: String): Qualifier = {
val q: Qualifier = new Qualifier
q.setClassid(classId)
q.setClassname(className)
q.setSchemeid(schemeId)
q.setSchemename(schemeName)
q
}
def asField[T](value: T): Field[T] = {
val tmp = new Field[T]
tmp.setValue(value)
tmp
} }
}

View File

@ -1,9 +1,10 @@
package eu.dnetlib.dhp.oa.provision; package eu.dnetlib.dhp.oa.provision;
import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException;
import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; import java.nio.file.Files;
import eu.dnetlib.dhp.schema.oaf.Relation; import java.nio.file.Path;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.FilterFunction;
@ -19,9 +20,10 @@ import org.junit.jupiter.api.io.TempDir;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException; import com.fasterxml.jackson.databind.ObjectMapper;
import java.nio.file.Files;
import java.nio.file.Path; import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class PrepareRelationsJobTest { public class PrepareRelationsJobTest {
@ -74,14 +76,19 @@ public class PrepareRelationsJobTest {
"-maxRelations", String.valueOf(maxRelations) "-maxRelations", String.valueOf(maxRelations)
}); });
Dataset<Relation> out = spark.read() Dataset<Relation> out = spark
.read()
.parquet(testPath.toString()) .parquet(testPath.toString())
.as(Encoders.bean(Relation.class)) .as(Encoders.bean(Relation.class))
.cache(); .cache();
Assertions.assertEquals(10, out.count()); Assertions.assertEquals(10, out.count());
Dataset<Row> freq = out.toDF().cube(SUBRELTYPE).count().filter((FilterFunction<Row>) value -> !value.isNullAt(0)); Dataset<Row> freq = out
.toDF()
.cube(SUBRELTYPE)
.count()
.filter((FilterFunction<Row>) value -> !value.isNullAt(0));
long outcome = freq.filter(freq.col(SUBRELTYPE).equalTo(OUTCOME)).collectAsList().get(0).getAs("count"); long outcome = freq.filter(freq.col(SUBRELTYPE).equalTo(OUTCOME)).collectAsList().get(0).getAs("count");
long supplement = freq.filter(freq.col(SUBRELTYPE).equalTo(SUPPLEMENT)).collectAsList().get(0).getAs("count"); long supplement = freq.filter(freq.col(SUBRELTYPE).equalTo(SUPPLEMENT)).collectAsList().get(0).getAs("count");