forked from D-Net/dnet-hadoop
implemented last part of workflows to generate scholixGraph
This commit is contained in:
parent
cfde63a7c3
commit
d9e3b89937
|
@ -0,0 +1,43 @@
|
|||
package eu.dnetlib.dhp.sx.graph
|
||||
import com.cloudera.com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
/**
  * Spark job that converts the JSON text dump of the result entities into Spark datasets.
  *
  * For every entity type handled here (dataset, otherresearchproduct, publication, software)
  * the job reads the text files found under `sourcePath/<entity>`, deserializes each line
  * with Jackson into the matching OAF class and writes the records, encoded with Kryo,
  * under `targetPath/<entity>`, overwriting any previous output.
  *
  * Expected arguments (as read in `main`): `master`, `sourcePath`, `targetPath`.
  */
object SparkConvertRDDtoDataset {

  /** Classpath location of the JSON file that describes the command line arguments. */
  private val ParametersResource = "/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json"

  /**
    * Reads the argument definition from the classpath.
    *
    * The stream is decoded explicitly as UTF-8 (instead of the platform default charset)
    * and is always closed, even when reading fails.
    *
    * @return the content of the parameter definition file
    * @throws IllegalArgumentException if the resource is not available on the classpath
    */
  private def loadParameterDefinition(): String = {
    val stream = Option(getClass.getResourceAsStream(ParametersResource))
      .getOrElse(throw new IllegalArgumentException(s"Missing classpath resource $ParametersResource"))
    try IOUtils.toString(stream, "UTF-8")
    finally stream.close()
  }

  /**
    * Job entry point.
    *
    * @param args command line arguments, parsed according to the parameter definition file
    */
  def main(args: Array[String]): Unit = {
    // Sub-folder name of each entity type paired with the class its JSON lines are parsed into.
    val entities = List(
      ("dataset", classOf[OafDataset]),
      ("otherresearchproduct", classOf[OtherResearchProduct]),
      ("publication", classOf[Publication]),
      ("software", classOf[Software])
    )

    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()

    val parser = new ArgumentApplicationParser(loadParameterDefinition())
    parser.parseArgument(args)

    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master"))
        .getOrCreate()

    val sourcePath = parser.get("sourcePath")
    log.info(s"sourcePath -> $sourcePath")
    val targetPath = parser.get("targetPath")
    log.info(s"targetPath -> $targetPath")

    val mapper = new ObjectMapper()
    // All entity types are stored through the same Kryo encoder of their common supertype.
    implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])

    entities.foreach { entity =>
      val entityName = entity._1
      // Ascribe Result so that the RDD element type matches the implicit Kryo encoder
      // instead of the existential type inferred from the heterogeneous class list.
      val rdd = spark.sparkContext
        .textFile(s"$sourcePath/$entityName")
        .map(line => mapper.readValue(line, entity._2): Result)

      spark
        .createDataset(rdd)
        .as[Result]
        .write
        .mode(SaveMode.Overwrite)
        .save(s"$targetPath/$entityName")
    }
  }
}
|
|
@ -30,7 +30,7 @@ object SparkResolveRelation {
|
|||
val relationPath = parser.get("relationPath")
|
||||
log.info(s"sourcePath -> $relationPath")
|
||||
val entityPath = parser.get("entityPath")
|
||||
log.info(s"targetPath -> $entityPath")
|
||||
log.info(s"entityPath -> $entityPath")
|
||||
val workingPath = parser.get("workingPath")
|
||||
log.info(s"workingPath -> $workingPath")
|
||||
|
||||
|
@ -48,8 +48,8 @@ object SparkResolveRelation {
|
|||
m =>
|
||||
val sourceResolved = m._2
|
||||
val currentRelation = m._1._2
|
||||
if (sourceResolved!=null && sourceResolved._2!=null && sourceResolved._2.nonEmpty)
|
||||
currentRelation.setSource(sourceResolved._2)
|
||||
if (sourceResolved!=null && sourceResolved._1!=null && sourceResolved._1.nonEmpty)
|
||||
currentRelation.setSource(sourceResolved._1)
|
||||
currentRelation
|
||||
}.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
|
@ -61,13 +61,13 @@ object SparkResolveRelation {
|
|||
m =>
|
||||
val targetResolved = m._2
|
||||
val currentRelation = m._1._2
|
||||
if (targetResolved!=null && targetResolved._2.nonEmpty)
|
||||
currentRelation.setTarget(targetResolved._2)
|
||||
if (targetResolved!=null && targetResolved._1.nonEmpty)
|
||||
currentRelation.setTarget(targetResolved._1)
|
||||
currentRelation
|
||||
}.filter(r => r.getSource.startsWith("50")&& r.getTarget.startsWith("50"))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/relation")
|
||||
.save(s"$workingPath/relation_resolved")
|
||||
}
|
||||
|
||||
|
||||
|
@ -89,16 +89,16 @@ object SparkResolveRelation {
|
|||
|
||||
val d: RDD[(String,String)] = spark.sparkContext.textFile(s"$entityPath/*")
|
||||
.map(i => extractPidsFromRecord(i))
|
||||
.filter(s => s != null && s._2!=null && s._2.nonEmpty)
|
||||
.filter(s => s != null && s._1!= null && s._2!=null && s._2.nonEmpty)
|
||||
.flatMap{ p =>
|
||||
p._2.map(pid =>
|
||||
(p._1,convertPidToDNETIdentifier(pid._1, pid._2))
|
||||
(p._1, convertPidToDNETIdentifier(pid._1, pid._2))
|
||||
)
|
||||
}
|
||||
}.filter(r =>r._1 != null || r._2 != null)
|
||||
|
||||
spark.createDataset(d)
|
||||
.groupByKey(_._1)
|
||||
.reduceGroups((x, y) => if (x._2.startsWith("50|doi") || x._2.startsWith("50|pmid")) x else y)
|
||||
.groupByKey(_._2)
|
||||
.reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y)
|
||||
.map(s => s._2)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="Create Raw Graph Step 1: extract Entities in raw graph" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Create Scholix final Graph" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
|
@ -6,48 +6,22 @@
|
|||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the graph Raw base path</description>
|
||||
<description>the final graph path</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ExtractEntities"/>
|
||||
<start to="ImportDatasetEntities"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ExtractEntities">
|
||||
<action name="ImportDatasetEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Extract entities in raw graph</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkCreateInputGraph</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
</spark>
|
||||
<ok to="ResolveRelations"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ResolveRelations">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Resolve Relations in raw graph</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkResolveRelation</class>
|
||||
<name>Import JSONRDD to Dataset kryo</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkConvertRDDtoDataset</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
|
@ -60,9 +34,8 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--relationPath</arg><arg>${targetPath}/extracted/relation</arg>
|
||||
<arg>--workingPath</arg><arg>${targetPath}/resolved/</arg>
|
||||
<arg>--entityPath</arg><arg>${targetPath}/dedup</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/entities</arg>
|
||||
</spark>
|
||||
<ok to="CreateSummaries"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -87,7 +60,7 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--sourcePath</arg><arg>${targetPath}/dedup</arg>
|
||||
<arg>--sourcePath</arg><arg>${targetPath}/entities</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/provision/summaries</arg>
|
||||
</spark>
|
||||
<ok to="CreateScholix"/>
|
||||
|
@ -114,7 +87,7 @@
|
|||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--summaryPath</arg><arg>${targetPath}/provision/summaries</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/provision/scholix</arg>
|
||||
<arg>--relationPath</arg><arg>${targetPath}/resolved/resolvedRelation</arg>
|
||||
<arg>--relationPath</arg><arg>${sourcePath}/relation_resolved</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="DropJSONPath"/>
|
||||
|
@ -182,9 +155,5 @@
|
|||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -15,14 +15,24 @@
|
|||
|
||||
</parameters>
|
||||
|
||||
<start to="ResolveRelations"/>
|
||||
<start to="DropRelFolder"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="DropRelFolder">
|
||||
<fs>
|
||||
<delete path='${targetPath}/relation'/>
|
||||
<delete path='${targetPath}/relation_resolved'/>
|
||||
<delete path='${targetPath}/resolvedSource'/>
|
||||
<delete path='${targetPath}/resolvedPid'/>
|
||||
|
||||
</fs>
|
||||
<ok to="ResolveRelations"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="ResolveRelations">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
|
Loading…
Reference in New Issue