forked from D-Net/dnet-hadoop
added generation of EBI Dataset
This commit is contained in:
parent
dab783b173
commit
1d420eedb4
|
@ -0,0 +1,89 @@
|
|||
package eu.dnetlib.dhp.sx.ebi
|
||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
|
||||
import org.apache.spark.sql.{Encoder, Encoders}
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
|
||||
|
||||
|
||||
object EBIAggregator {
|
||||
|
||||
def getDatasetAggregator(): Aggregator[(String, OafDataset), OafDataset, OafDataset] = new Aggregator[(String, OafDataset), OafDataset, OafDataset]{
|
||||
|
||||
override def zero: OafDataset = new OafDataset()
|
||||
|
||||
override def reduce(b: OafDataset, a: (String, OafDataset)): OafDataset = {
|
||||
b.mergeFrom(a._2)
|
||||
if (b.getId == null)
|
||||
b.setId(a._2.getId)
|
||||
b
|
||||
}
|
||||
|
||||
|
||||
override def merge(wx: OafDataset, wy: OafDataset): OafDataset = {
|
||||
wx.mergeFrom(wy)
|
||||
if(wx.getId == null && wy.getId.nonEmpty)
|
||||
wx.setId(wy.getId)
|
||||
wx
|
||||
}
|
||||
override def finish(reduction: OafDataset): OafDataset = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[OafDataset] =
|
||||
Encoders.kryo(classOf[OafDataset])
|
||||
|
||||
override def outputEncoder: Encoder[OafDataset] =
|
||||
Encoders.kryo(classOf[OafDataset])
|
||||
}
|
||||
|
||||
|
||||
def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
|
||||
|
||||
override def zero: Publication = new Publication()
|
||||
|
||||
override def reduce(b: Publication, a: (String, Publication)): Publication = {
|
||||
b.mergeFrom(a._2)
|
||||
if (b.getId == null)
|
||||
b.setId(a._2.getId)
|
||||
b
|
||||
}
|
||||
|
||||
|
||||
override def merge(wx: Publication, wy: Publication): Publication = {
|
||||
wx.mergeFrom(wy)
|
||||
if(wx.getId == null && wy.getId.nonEmpty)
|
||||
wx.setId(wy.getId)
|
||||
wx
|
||||
}
|
||||
override def finish(reduction: Publication): Publication = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[Publication] =
|
||||
Encoders.kryo(classOf[Publication])
|
||||
|
||||
override def outputEncoder: Encoder[Publication] =
|
||||
Encoders.kryo(classOf[Publication])
|
||||
}
|
||||
|
||||
|
||||
def getRelationAggregator(): Aggregator[(String, Relation), Relation, Relation] = new Aggregator[(String, Relation), Relation, Relation]{
|
||||
|
||||
override def zero: Relation = new Relation()
|
||||
|
||||
override def reduce(b: Relation, a: (String, Relation)): Relation = {
|
||||
a._2
|
||||
}
|
||||
|
||||
|
||||
override def merge(a: Relation, b: Relation): Relation = {
|
||||
if(b!= null) b else a
|
||||
}
|
||||
override def finish(reduction: Relation): Relation = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[Relation] =
|
||||
Encoders.kryo(classOf[Relation])
|
||||
|
||||
override def outputEncoder: Encoder[Relation] =
|
||||
Encoders.kryo(classOf[Relation])
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
package eu.dnetlib.dhp.sx.ebi
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
|
||||
import eu.dnetlib.dhp.sx.graph.parser.{DatasetScholexplorerParser, PublicationScholexplorerParser}
|
||||
import eu.dnetlib.scholexplorer.relation.RelationMapper
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object SparkCreateEBIDataFrame {
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val logger: Logger = LoggerFactory.getLogger(SparkCreateEBIDataFrame.getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
val sc = spark.sparkContext
|
||||
|
||||
|
||||
val workingPath = parser.get("workingPath")
|
||||
val relationMapper = RelationMapper.load
|
||||
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
|
||||
implicit val pubEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
|
||||
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||
|
||||
logger.info("Extract Publication and relation from publication_xml")
|
||||
val oafPubsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/publication_xml").map(s =>
|
||||
{
|
||||
new ObjectMapper().readValue(s, classOf[String])
|
||||
}).flatMap(s => {
|
||||
val d = new PublicationScholexplorerParser
|
||||
d.parseObject(s, relationMapper).asScala.iterator})
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||
spark.createDataset(oafPubsRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/oaf")
|
||||
|
||||
logger.info("Extract Publication and relation from dataset_xml")
|
||||
val oafDatsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/_dataset_xml").map(s =>
|
||||
{
|
||||
new ObjectMapper().readValue(s, classOf[String])
|
||||
}).flatMap(s => {
|
||||
val d = new DatasetScholexplorerParser
|
||||
d.parseObject(s, relationMapper).asScala.iterator})
|
||||
|
||||
spark.createDataset(oafDatsRDD).write.mode(SaveMode.Append).save(s"$workingPath/oaf")
|
||||
val dataset: Dataset[OafDataset] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[OafDataset]).map(d => d.asInstanceOf[OafDataset])
|
||||
val publication: Dataset[Publication] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[Publication]).map(d => d.asInstanceOf[Publication])
|
||||
val relations: Dataset[Relation] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[Relation]).map(d => d.asInstanceOf[Relation])
|
||||
publication.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder))
|
||||
.groupByKey(_._1)(Encoders.STRING)
|
||||
.agg(EBIAggregator.getPublicationAggregator().toColumn)
|
||||
.map(p => p._2)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/publication")
|
||||
|
||||
dataset.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datasetEncoder))
|
||||
.groupByKey(_._1)(Encoders.STRING)
|
||||
.agg(EBIAggregator.getDatasetAggregator().toColumn)
|
||||
.map(p => p._2)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset")
|
||||
|
||||
relations.map(d => (s"${d.getSource}::${d.getRelType}::${d.getTarget}", d))(Encoders.tuple(Encoders.STRING, relEncoder))
|
||||
.groupByKey(_._1)(Encoders.STRING)
|
||||
.agg(EBIAggregator.getRelationAggregator().toColumn)
|
||||
.map(p => p._2)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/relation")
|
||||
}
|
||||
}
|
|
@ -150,6 +150,19 @@ public abstract class AbstractScholexplorerParser {
|
|||
return uk;
|
||||
}
|
||||
|
||||
|
||||
protected Qualifier generateQualifier(final String classId, final String className, final String schemeId, final String schemeName) {
|
||||
final Qualifier q = new Qualifier();
|
||||
q.setClassid(classId);
|
||||
q.setClassid(className);
|
||||
q.setSchemeid(schemeId);
|
||||
q.setSchemename(schemeName);
|
||||
return q;
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
protected void generateRelations(
|
||||
RelationMapper relationMapper,
|
||||
Result parsedObject,
|
||||
|
|
|
@ -64,7 +64,6 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
|
|||
currentDate.setQualifier(dateQualifier);
|
||||
parsedObject.setRelevantdate(Collections.singletonList(currentDate));
|
||||
}
|
||||
|
||||
final String completionStatus = VtdUtilityParser
|
||||
.getSingleValue(ap, vn, "//*[local-name()='completionStatus']");
|
||||
final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
|
||||
|
@ -149,6 +148,42 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
|
|||
inferPid(currentPid);
|
||||
parsedObject.setPid(Collections.singletonList(currentPid));
|
||||
|
||||
|
||||
String resolvedURL = null;
|
||||
|
||||
switch (currentPid.getQualifier().getClassname().toLowerCase()) {
|
||||
case "uniprot":
|
||||
resolvedURL ="https://www.uniprot.org/uniprot/"+currentPid.getValue();
|
||||
break;
|
||||
case "ena":
|
||||
if (StringUtils.isNotBlank(currentPid.getValue()) && currentPid.getValue().length() > 7)
|
||||
resolvedURL ="https://www.ebi.ac.uk/ena/data/view/"+currentPid.getValue().substring(0,8);
|
||||
break;
|
||||
case "chembl":
|
||||
resolvedURL ="https://www.ebi.ac.uk/chembl/compound_report_card/"+currentPid.getValue();
|
||||
break;
|
||||
|
||||
case "ncbi-n":
|
||||
resolvedURL ="https://www.ncbi.nlm.nih.gov/nuccore/"+currentPid.getValue();
|
||||
break;
|
||||
case "ncbi-p":
|
||||
resolvedURL ="https://www.ncbi.nlm.nih.gov/nuccore/"+currentPid.getValue();
|
||||
break;
|
||||
case "genbank":
|
||||
resolvedURL ="https://www.ncbi.nlm.nih.gov/nuccore/"+currentPid.getValue();
|
||||
break;
|
||||
case "pdb":
|
||||
resolvedURL ="https://www.ncbi.nlm.nih.gov/nuccore/"+currentPid.getValue();
|
||||
break;
|
||||
case "url":
|
||||
resolvedURL =currentPid.getValue();
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
final String sourceId = generateId(
|
||||
currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset");
|
||||
parsedObject.setId(sourceId);
|
||||
|
@ -251,6 +286,7 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
|
|||
t -> {
|
||||
final StructuredProperty st = new StructuredProperty();
|
||||
st.setValue(t);
|
||||
st.setQualifier(generateQualifier( "main title","main title", "dnet:dataCite_title","dnet:dataCite_title"));
|
||||
return st;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
|
@ -282,6 +318,14 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
|
|||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
|
||||
if(StringUtils.isNotBlank(resolvedURL)) {
|
||||
Instance i = new Instance();
|
||||
i.setCollectedfrom(parsedObject.getCollectedfrom().get(0));
|
||||
i.setUrl(Collections.singletonList(resolvedURL));
|
||||
parsedObject.setInstance(Collections.singletonList(i));
|
||||
}
|
||||
|
||||
result.add(parsedObject);
|
||||
return result;
|
||||
} catch (Throwable e) {
|
||||
|
|
|
@ -202,6 +202,7 @@ public class PublicationScholexplorerParser extends AbstractScholexplorerParser
|
|||
t -> {
|
||||
final StructuredProperty st = new StructuredProperty();
|
||||
st.setValue(t);
|
||||
st.setQualifier(generateQualifier( "main title","main title", "dnet:dataCite_title","dnet:dataCite_title"));
|
||||
return st;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true}
|
||||
]
|
|
@ -0,0 +1,38 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,51 @@
|
|||
<workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the Working Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="CreateEBIDataSet"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="CreateEBIDataSet">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Create EBI DataSet</name>
|
||||
<class>eu.dnetlib.dhp.sx.ebi.SparkCreateEBIDataFrame</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=1000
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,22 @@
|
|||
package eu.dnetlib.dhp.sx.ebi
|
||||
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
class TestEBITODS {
|
||||
|
||||
|
||||
@Test
|
||||
def testEBI():Unit = {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
Loading…
Reference in New Issue