forked from D-Net/dnet-hadoop
added SparkResolveEntities node to the oozie wf
This commit is contained in:
parent
9cb195314f
commit
2ca0a436ad
|
@ -2,9 +2,11 @@ package eu.dnetlib.dhp.oa.graph.resolution
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.common.HdfsSupport
|
||||
import eu.dnetlib.dhp.schema.common.EntityType
|
||||
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
@ -34,14 +36,23 @@ object SparkResolveEntities {
|
|||
val unresolvedPath = parser.get("unresolvedPath")
|
||||
log.info(s"unresolvedPath -> $unresolvedPath")
|
||||
|
||||
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
|
||||
fs.mkdirs(new Path(workingPath))
|
||||
|
||||
resolveEntities(spark, workingPath, unresolvedPath)
|
||||
|
||||
|
||||
generateResolvedEntities(spark, workingPath, graphBasePath)
|
||||
|
||||
// TO BE conservative we keep the original entities in the working dir
|
||||
// and save the resolved entities on the graphBasePath
|
||||
//In future these lines of code should be removed
|
||||
entities.foreach {
|
||||
e =>
|
||||
fs.rename(new Path(s"$graphBasePath/$e"), new Path(s"$workingPath/${e}_old"))
|
||||
fs.rename(new Path(s"$workingPath/resolvedGraph/$e"), new Path(s"$graphBasePath/$e"))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = {
|
||||
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||
|
|
|
@ -4,6 +4,10 @@
|
|||
<name>graphBasePath</name>
|
||||
<description>the path of the graph</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>unresolvedPath</name>
|
||||
<description>the path of the unresolved Entities</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResolveRelations"/>
|
||||
|
@ -36,5 +40,33 @@
|
|||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ResolveEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Resolve Relations in raw graph</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.resolution.SparkResolveEntities</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=10000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||
<arg>--unresolvedPath</arg><arg>${unresolvedPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
Loading…
Reference in New Issue