mergin with branch beta
commit
2aca6bfa0a
@ -1,41 +0,0 @@
|
||||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import org.apache.hadoop.io.Text
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
object ExportActionSetJobNode {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf = new SparkConf
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString)
|
||||
parser.parseArgument(args)
|
||||
val master = parser.get("master")
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
val targetPath = parser.get("targetPath")
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().config(conf)
|
||||
.appName(ExportActionSetJobNode.getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING)
|
||||
|
||||
spark.read.load(sourcePath).as[Oaf]
|
||||
.map(o =>DataciteToOAFTransformation.toActionSet(o))
|
||||
.filter(o => o!= null)
|
||||
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -1,46 +0,0 @@
|
||||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
object FilterCrossrefEntitiesSpark {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(getClass.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf = new SparkConf
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json")).mkString)
|
||||
parser.parseArgument(args)
|
||||
val master = parser.get("master")
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info("sourcePath: {}", sourcePath)
|
||||
val targetPath = parser.get("targetPath")
|
||||
log.info("targetPath: {}", targetPath)
|
||||
|
||||
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
|
||||
|
||||
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val resEncoder: Encoder[Result] = Encoders.kryo[Result]
|
||||
|
||||
val d:Dataset[Oaf]= spark.read.load(sourcePath).as[Oaf]
|
||||
|
||||
d.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result]).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,49 @@
|
||||
package eu.dnetlib.dhp.collection
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
|
||||
|
||||
object CollectionUtils {
|
||||
|
||||
/**
|
||||
* This method in pipeline to the transformation phase,
|
||||
* generates relations in both verse, typically it should be a phase of flatMap
|
||||
*
|
||||
* @param i input OAF
|
||||
* @return
|
||||
* If the input OAF is an entity -> List(i)
|
||||
* If the input OAF is a relation -> List(relation, inverseRelation)
|
||||
*
|
||||
*/
|
||||
|
||||
def fixRelations(i: Oaf): List[Oaf] = {
|
||||
if (i.isInstanceOf[OafEntity])
|
||||
return List(i)
|
||||
else {
|
||||
val r: Relation = i.asInstanceOf[Relation]
|
||||
val currentRel = ModelSupport.findRelation(r.getRelClass)
|
||||
if (currentRel != null) {
|
||||
|
||||
// Cleaning relation
|
||||
r.setRelType(currentRel.getRelType)
|
||||
r.setSubRelType(currentRel.getSubReltype)
|
||||
r.setRelClass(currentRel.getRelClass)
|
||||
val inverse = new Relation
|
||||
inverse.setSource(r.getTarget)
|
||||
inverse.setTarget(r.getSource)
|
||||
inverse.setRelType(currentRel.getRelType)
|
||||
inverse.setSubRelType(currentRel.getSubReltype)
|
||||
inverse.setRelClass(currentRel.getInverseRelClass)
|
||||
inverse.setCollectedfrom(r.getCollectedfrom)
|
||||
inverse.setDataInfo(r.getDataInfo)
|
||||
inverse.setProperties(r.getProperties)
|
||||
inverse.setLastupdatetimestamp(r.getLastupdatetimestamp)
|
||||
inverse.setValidated(r.getValidated)
|
||||
inverse.setValidationDate(r.getValidationDate)
|
||||
return List(r, inverse)
|
||||
}
|
||||
}
|
||||
List()
|
||||
}
|
||||
|
||||
}
|
@ -1,12 +1,10 @@
|
||||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.http.client.config.RequestConfig
|
||||
import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest}
|
||||
import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest}
|
||||
import org.apache.http.entity.StringEntity
|
||||
import org.apache.http.impl.client.{HttpClientBuilder, HttpClients}
|
||||
|
||||
import java.io.IOException
|
||||
import org.apache.http.impl.client.HttpClientBuilder
|
||||
|
||||
|
||||
abstract class AbstractRestClient extends Iterator[String] {
|
@ -1,7 +1,7 @@
|
||||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import org.json4s.{DefaultFormats, JValue}
|
||||
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||
import org.json4s.{DefaultFormats, JValue}
|
||||
|
||||
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
|
||||
|
@ -1,4 +1,4 @@
|
||||
package eu.dnetllib.dhp.sx.bio
|
||||
package eu.dnetlib.dhp.sx.bio
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
|
@ -1,10 +1,10 @@
|
||||
package eu.dnetllib.dhp.sx.bio.ebi
|
||||
package eu.dnetlib.dhp.sx.bio.ebi
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.oaf.Result
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import eu.dnetllib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
|
@ -1,8 +1,9 @@
|
||||
package eu.dnetllib.dhp.sx.bio.ebi
|
||||
package eu.dnetlib.dhp.sx.bio.ebi
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetllib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
||||
import eu.dnetllib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.PMJournal
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.http.client.config.RequestConfig
|
||||
import org.apache.http.client.methods.HttpGet
|
@ -1,5 +1,5 @@
|
||||
|
||||
package eu.dnetllib.dhp.sx.bio.pubmed;
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
@ -1,5 +1,5 @@
|
||||
|
||||
package eu.dnetllib.dhp.sx.bio.pubmed;
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -1,5 +1,5 @@
|
||||
|
||||
package eu.dnetllib.dhp.sx.bio.pubmed;
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
public class PMGrant {
|
||||
|
@ -1,5 +1,5 @@
|
||||
|
||||
package eu.dnetllib.dhp.sx.bio.pubmed;
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -1,4 +1,4 @@
|
||||
package eu.dnetllib.dhp.sx.bio.pubmed
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed
|
||||
|
||||
import scala.xml.MetaData
|
||||
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
|
@ -1,5 +1,5 @@
|
||||
|
||||
package eu.dnetllib.dhp.sx.bio.pubmed;
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
public class PMSubject {
|
||||
private String value;
|
@ -1,4 +1,4 @@
|
||||
package eu.dnetllib.dhp.sx.bio.pubmed
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed
|
||||
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
@ -1,81 +0,0 @@
|
||||
<workflow-app name="Import_Datacite_and_transform_to_OAF" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>mainPath</name>
|
||||
<description>the working path of Datacite stores</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The IS lookUp service endopoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>blocksize</name>
|
||||
<value>100</value>
|
||||
<description>The request block size</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<start to="ImportDatacite"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="ImportDatacite">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ImportDatacite</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.datacite.ImportDatacite</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--targetPath</arg><arg>${mainPath}/datacite_update</arg>
|
||||
<arg>--dataciteDumpPath</arg><arg>${mainPath}/datacite_dump</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
<arg>--blocksize</arg><arg>${blocksize}</arg>
|
||||
</spark>
|
||||
<ok to="TransformJob"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="TransformJob">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>TransformJob</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${mainPath}/datacite_dump</arg>
|
||||
<arg>--targetPath</arg><arg>${mainPath}/datacite_oaf</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--exportLinks</arg><arg>false</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -1,84 +0,0 @@
|
||||
<workflow-app name="Generate_Datacite_and_Crossref_dump_for_Scholexplorer" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>datacitePath</name>
|
||||
<description>the path of Datacite spark dataset</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The IS lookUp service endopoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>crossrefPath</name>
|
||||
<description>the path of Crossref spark dataset</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the path of Crossref spark dataset</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<start to="ImportDatacite"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="ImportDatacite">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ImportDatacite</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${datacitePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/datacite_oaf</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--exportLinks</arg><arg>true</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="FilterCrossrefEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="FilterCrossrefEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>FilterCrossrefEntities</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.datacite.FilterCrossrefEntitiesSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${crossrefPath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/crossref_oaf</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -1,46 +1,52 @@
|
||||
<workflow-app name="Datacite_to_ActionSet_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Collect_Datacite" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<name>mainPath</name>
|
||||
<description>the working path of Datacite stores</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the path of Datacite ActionSet</description>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The IS lookUp service endopoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>blocksize</name>
|
||||
<value>100</value>
|
||||
<description>The request block size</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<start to="ExportDataset"/>
|
||||
<start to="ImportDatacite"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="ExportDataset">
|
||||
<action name="ImportDatacite">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExportDataset</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode</class>
|
||||
<name>ImportDatacite</name>
|
||||
<class>eu.dnetlib.dhp.datacite.ImportDatacite</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--targetPath</arg><arg>${mainPath}/datacite_update</arg>
|
||||
<arg>--dataciteDumpPath</arg><arg>${mainPath}/datacite_dump</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
<arg>--blocksize</arg><arg>${blocksize}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -0,0 +1,126 @@
|
||||
<workflow-app name="transform_Datacite" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>mainPath</name>
|
||||
<description>the working path of Datacite stores</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The IS lookUp service endopoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mdStoreOutputId</name>
|
||||
<description>the identifier of the cleaned MDStore</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mdStoreManagerURI</name>
|
||||
<description>the path of the cleaned mdstore</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="StartTransaction"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="StartTransaction">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>NEW_VERSION</arg>
|
||||
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="TransformJob"/>
|
||||
<error to="EndReadRollBack"/>
|
||||
</action>
|
||||
|
||||
<action name="TransformJob">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>TransformJob</name>
|
||||
<class>eu.dnetlib.dhp.datacite.GenerateDataciteDatasetSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${mainPath}/datacite_dump</arg>
|
||||
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--exportLinks</arg><arg>true</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="CommitVersion"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="CommitVersion">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>COMMIT</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="EndReadRollBack">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>READ_UNLOCK</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="RollBack"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="RollBack">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
|
||||
<arg>--action</arg><arg>ROLLBACK</arg>
|
||||
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
</java>
|
||||
<ok to="Kill"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -0,0 +1,51 @@
|
||||
<workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the PDB Database Working Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>database</name>
|
||||
<description>the PDB Database Working Path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the Target Working dir path</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ConvertDB"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ConvertDB">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Bio DB to OAF Dataset</name>
|
||||
<class>eu.dnetlib.dhp.sx.bio.SparkTransformBioDatabaseToOAF</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--dbPath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--database</arg><arg>${database}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
@ -1,8 +1,7 @@
|
||||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import com.fasterxml.jackson.databind.SerializationFeature
|
||||
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
|
||||
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import org.junit.jupiter.api.extension.ExtendWith
|
@ -1,10 +1,10 @@
|
||||
package eu.dnetllib.dhp.sx.bio
|
||||
package eu.dnetlib.dhp.sx.bio
|
||||
|
||||
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
|
||||
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
|
||||
import eu.dnetllib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||
import eu.dnetllib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PubMedToOaf}
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PubMedToOaf}
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
@ -0,0 +1,136 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
/**
|
||||
* It selects the valid relations among those present in the graph. One relation is valid if it is not deletedbyinference
|
||||
* and if both the source and the target node are present in the graph and are not deleted by inference nor invisible.
|
||||
* To check this I made a view of the ids of all the entities in the graph, and select the relations for which a join exists
|
||||
* with this view for both the source and the target
|
||||
*/
|
||||
|
||||
public class SparkSelectValidRelationsJob implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkSelectValidRelationsJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkSelectValidRelationsJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/complete/input_relationdump_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
selectValidRelation(spark, inputPath, outputPath);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void selectValidRelation(SparkSession spark, String inputPath, String outputPath) {
|
||||
Dataset<Relation> relation = Utils.readPath(spark, inputPath + "/relation", Relation.class);
|
||||
Dataset<Publication> publication = Utils.readPath(spark, inputPath + "/publication", Publication.class);
|
||||
Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> dataset = Utils
|
||||
.readPath(spark, inputPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
|
||||
Dataset<Software> software = Utils.readPath(spark, inputPath + "/software", Software.class);
|
||||
Dataset<OtherResearchProduct> other = Utils
|
||||
.readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class);
|
||||
Dataset<Organization> organization = Utils.readPath(spark, inputPath + "/organization", Organization.class);
|
||||
Dataset<Project> project = Utils.readPath(spark, inputPath + "/project", Project.class);
|
||||
Dataset<Datasource> datasource = Utils.readPath(spark, inputPath + "/datasource", Datasource.class);
|
||||
|
||||
relation.createOrReplaceTempView("relation");
|
||||
publication.createOrReplaceTempView("publication");
|
||||
dataset.createOrReplaceTempView("dataset");
|
||||
other.createOrReplaceTempView("other");
|
||||
software.createOrReplaceTempView("software");
|
||||
organization.createOrReplaceTempView("organization");
|
||||
project.createOrReplaceTempView("project");
|
||||
datasource.createOrReplaceTempView("datasource");
|
||||
|
||||
spark
|
||||
.sql(
|
||||
"SELECT id " +
|
||||
"FROM publication " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " +
|
||||
"UNION ALL " +
|
||||
"SELECT id " +
|
||||
"FROM dataset " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " +
|
||||
"UNION ALL " +
|
||||
"SELECT id " +
|
||||
"FROM other " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " +
|
||||
"UNION ALL " +
|
||||
"SELECT id " +
|
||||
"FROM software " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " +
|
||||
"UNION ALL " +
|
||||
"SELECT id " +
|
||||
"FROM organization " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " +
|
||||
"UNION ALL " +
|
||||
"SELECT id " +
|
||||
"FROM project " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " +
|
||||
"UNION ALL " +
|
||||
"SELECT id " +
|
||||
"FROM datasource " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false ")
|
||||
.createOrReplaceTempView("identifiers");
|
||||
|
||||
spark
|
||||
.sql(
|
||||
"SELECT relation.* " +
|
||||
"FROM relation " +
|
||||
"JOIN identifiers i1 " +
|
||||
"ON source = i1.id " +
|
||||
"JOIN identifiers i2 " +
|
||||
"ON target = i2.id " +
|
||||
"WHERE datainfo.deletedbyinference = false")
|
||||
.as(Encoders.bean(Relation.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath);
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,30 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.exceptions;
|
||||
|
||||
public class MyRuntimeException extends RuntimeException {
|
||||
|
||||
public MyRuntimeException() {
|
||||
super();
|
||||
}
|
||||
|
||||
public MyRuntimeException(
|
||||
final String message,
|
||||
final Throwable cause,
|
||||
final boolean enableSuppression,
|
||||
final boolean writableStackTrace) {
|
||||
super(message, cause, enableSuppression, writableStackTrace);
|
||||
}
|
||||
|
||||
public MyRuntimeException(final String message, final Throwable cause) {
|
||||
super(message, cause);
|
||||
}
|
||||
|
||||
public MyRuntimeException(final String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
public MyRuntimeException(final Throwable cause) {
|
||||
super(cause);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,29 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.exceptions;
|
||||
|
||||
public class NoAvailableEntityTypeException extends Exception {
|
||||
public NoAvailableEntityTypeException() {
|
||||
super();
|
||||
}
|
||||
|
||||
public NoAvailableEntityTypeException(
|
||||
final String message,
|
||||
final Throwable cause,
|
||||
final boolean enableSuppression,
|
||||
final boolean writableStackTrace) {
|
||||
super(message, cause, enableSuppression, writableStackTrace);
|
||||
}
|
||||
|
||||
public NoAvailableEntityTypeException(final String message, final Throwable cause) {
|
||||
super(message, cause);
|
||||
}
|
||||
|
||||
public NoAvailableEntityTypeException(final String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
public NoAvailableEntityTypeException(final Throwable cause) {
|
||||
super(cause);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,74 @@
|
||||
package eu.dnetlib.dhp.oa.graph.raw
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.common.HdfsSupport
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.apache.http.client.methods.HttpGet
|
||||
import org.apache.http.impl.client.HttpClients
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.slf4j.LoggerFactory
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.Source
|
||||
|
||||
object CopyHdfsOafSparkApplication {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val log = LoggerFactory.getLogger(getClass)
|
||||
val conf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json")).mkString)
|
||||
parser.parseArgument(args)
|
||||
|
||||
val spark =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
val sc: SparkContext = spark.sparkContext
|
||||
|
||||
val mdstoreManagerUrl = parser.get("mdstoreManagerUrl")
|
||||
log.info("mdstoreManagerUrl: {}", mdstoreManagerUrl)
|
||||
|
||||
val mdFormat = parser.get("mdFormat")
|
||||
log.info("mdFormat: {}", mdFormat)
|
||||
|
||||
val mdLayout = parser.get("mdLayout")
|
||||
log.info("mdLayout: {}", mdLayout)
|
||||
|
||||
val mdInterpretation = parser.get("mdInterpretation")
|
||||
log.info("mdInterpretation: {}", mdInterpretation)
|
||||
|
||||
val hdfsPath = parser.get("hdfsPath")
|
||||
log.info("hdfsPath: {}", hdfsPath)
|
||||
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
|
||||
val paths = DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala
|
||||
|
||||
val validPaths: List[String] = paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList
|
||||
|
||||
if (validPaths.nonEmpty) {
|
||||
val oaf = spark.read.load(validPaths: _*).as[Oaf]
|
||||
val mapper = new ObjectMapper()
|
||||
val l =ModelSupport.oafTypes.entrySet.asScala.map(e => e.getKey).toList
|
||||
l.foreach(
|
||||
e =>
|
||||
oaf.filter(o => o.getClass.getSimpleName.equalsIgnoreCase(e))
|
||||
.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
|
||||
.write
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Append)
|
||||
.text(s"$hdfsPath/${e}")
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,153 @@
|
||||
package eu.dnetlib.dhp.oa.graph.resolution
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.common.HdfsSupport
|
||||
import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql._
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
object SparkResolveRelation {
|
||||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json")))
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
|
||||
val graphBasePath = parser.get("graphBasePath")
|
||||
log.info(s"graphBasePath -> $graphBasePath")
|
||||
val workingPath = parser.get("workingPath")
|
||||
log.info(s"workingPath -> $workingPath")
|
||||
|
||||
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
//CLEANING TEMPORARY FOLDER
|
||||
HdfsSupport.remove(workingPath, spark.sparkContext.hadoopConfiguration)
|
||||
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
|
||||
fs.mkdirs(new Path(workingPath))
|
||||
|
||||
extractPidResolvedTableFromJsonRDD(spark, graphBasePath, workingPath)
|
||||
|
||||
val mapper: ObjectMapper = new ObjectMapper()
|
||||
|
||||
val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
|
||||
|
||||
val relationDs: Dataset[(String, Relation)] = spark.read.text(s"$graphBasePath/relation").as[String]
|
||||
.map(s => mapper.readValue(s, classOf[Relation])).as[Relation]
|
||||
.map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
||||
|
||||
relationDs.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left").map {
|
||||
m =>
|
||||
val sourceResolved = m._2
|
||||
val currentRelation = m._1._2
|
||||
if (sourceResolved != null && sourceResolved._1 != null && sourceResolved._1.nonEmpty)
|
||||
currentRelation.setSource(sourceResolved._1)
|
||||
currentRelation
|
||||
}.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/relationResolvedSource")
|
||||
|
||||
|
||||
val relationSourceResolved: Dataset[(String, Relation)] = spark.read.load(s"$workingPath/relationResolvedSource").as[Relation]
|
||||
.map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
||||
relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left").map {
|
||||
m =>
|
||||
val targetResolved = m._2
|
||||
val currentRelation = m._1._2
|
||||
if (targetResolved != null && targetResolved._1.nonEmpty)
|
||||
currentRelation.setTarget(targetResolved._1)
|
||||
currentRelation
|
||||
}
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/relation_resolved")
|
||||
|
||||
|
||||
// TO BE conservative we keep the original relation in the working dir
|
||||
// and save the relation resolved on the graphBasePath
|
||||
//In future this two line of code should be removed
|
||||
|
||||
fs.rename(new Path(s"$graphBasePath/relation"), new Path(s"$workingPath/relation"))
|
||||
|
||||
spark.read.load(s"$workingPath/relation_resolved").as[Relation]
|
||||
.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
|
||||
.map(r => mapper.writeValueAsString(r))
|
||||
.write
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.text(s"$graphBasePath/relation")
|
||||
}
|
||||
|
||||
|
||||
def extractPidsFromRecord(input: String): (String, List[(String, String)]) = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
val id: String = (json \ "id").extract[String]
|
||||
val result: List[(String, String)] = for {
|
||||
JObject(pids) <- json \\ "instance" \ "pid"
|
||||
JField("value", JString(pidValue)) <- pids
|
||||
JField("qualifier", JObject(qualifier)) <- pids
|
||||
JField("classid", JString(pidType)) <- qualifier
|
||||
} yield (pidValue, pidType)
|
||||
|
||||
val alternateIds: List[(String, String)] = for {
|
||||
JObject(pids) <- json \\ "alternateIdentifier"
|
||||
JField("value", JString(pidValue)) <- pids
|
||||
JField("qualifier", JObject(qualifier)) <- pids
|
||||
JField("classid", JString(pidType)) <- qualifier
|
||||
} yield (pidValue, pidType)
|
||||
|
||||
(id, result ::: alternateIds)
|
||||
}
|
||||
|
||||
|
||||
private def isRelation(input: String): Boolean = {
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
val source = (json \ "source").extractOrElse[String](null)
|
||||
|
||||
source != null
|
||||
}
|
||||
|
||||
private def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = {
|
||||
import spark.implicits._
|
||||
|
||||
val d: RDD[(String, String)] = spark.sparkContext.textFile(s"$graphPath/*")
|
||||
.filter(i => !isRelation(i))
|
||||
.map(i => extractPidsFromRecord(i))
|
||||
.filter(s => s != null && s._1 != null && s._2 != null && s._2.nonEmpty)
|
||||
.flatMap { p =>
|
||||
p._2.map(pid =>
|
||||
(p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2))
|
||||
)
|
||||
}.filter(r => r._1 != null || r._2 != null)
|
||||
|
||||
spark.createDataset(d)
|
||||
.groupByKey(_._2)
|
||||
.reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y)
|
||||
.map(s => s._2)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/relationResolvedPid")
|
||||
}
|
||||
|
||||
}
|
@ -1,154 +0,0 @@
|
||||
package eu.dnetlib.dhp.sx.graph
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql._
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
object SparkResolveRelation {
|
||||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/resolve_relations_params.json")))
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
|
||||
val relationPath = parser.get("relationPath")
|
||||
log.info(s"sourcePath -> $relationPath")
|
||||
val entityPath = parser.get("entityPath")
|
||||
log.info(s"entityPath -> $entityPath")
|
||||
val workingPath = parser.get("workingPath")
|
||||
log.info(s"workingPath -> $workingPath")
|
||||
|
||||
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
extractPidResolvedTableFromJsonRDD(spark, entityPath, workingPath)
|
||||
|
||||
val mappper = new ObjectMapper()
|
||||
|
||||
val rPid:Dataset[(String,String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String,String)]
|
||||
|
||||
val relationDs:Dataset[(String,Relation)] = spark.read.load(relationPath).as[Relation].map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
||||
|
||||
relationDs.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left").map{
|
||||
m =>
|
||||
val sourceResolved = m._2
|
||||
val currentRelation = m._1._2
|
||||
if (sourceResolved!=null && sourceResolved._1!=null && sourceResolved._1.nonEmpty)
|
||||
currentRelation.setSource(sourceResolved._1)
|
||||
currentRelation
|
||||
}.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/relationResolvedSource")
|
||||
|
||||
|
||||
val relationSourceResolved:Dataset[(String,Relation)] = spark.read.load(s"$workingPath/relationResolvedSource").as[Relation].map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder))
|
||||
relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left").map{
|
||||
m =>
|
||||
val targetResolved = m._2
|
||||
val currentRelation = m._1._2
|
||||
if (targetResolved!=null && targetResolved._1.nonEmpty)
|
||||
currentRelation.setTarget(targetResolved._1)
|
||||
currentRelation
|
||||
}.filter(r => r.getSource.startsWith("50")&& r.getTarget.startsWith("50"))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/relation_resolved")
|
||||
|
||||
spark.read.load(s"$workingPath/relation_resolved").as[Relation]
|
||||
.map(r => mappper.writeValueAsString(r))
|
||||
.rdd.saveAsTextFile(s"$workingPath/relation", classOf[GzipCodec])
|
||||
|
||||
}
|
||||
|
||||
|
||||
def extractPidsFromRecord(input:String):(String,List[(String,String)]) = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
val id:String = (json \ "id").extract[String]
|
||||
val result: List[(String,String)] = for {
|
||||
JObject(pids) <- json \ "pid"
|
||||
JField("value", JString(pidValue)) <- pids
|
||||
JField("qualifier", JObject(qualifier)) <- pids
|
||||
JField("classname", JString(pidType)) <- qualifier
|
||||
} yield (pidValue, pidType)
|
||||
|
||||
val alternateIds: List[(String,String)] = for {
|
||||
JObject(pids) <- json \\ "alternateIdentifier"
|
||||
JField("value", JString(pidValue)) <- pids
|
||||
JField("qualifier", JObject(qualifier)) <- pids
|
||||
JField("classname", JString(pidType)) <- qualifier
|
||||
} yield (pidValue, pidType)
|
||||
|
||||
(id,result:::alternateIds)
|
||||
}
|
||||
|
||||
private def extractPidResolvedTableFromJsonRDD(spark: SparkSession, entityPath: String, workingPath: String) = {
|
||||
import spark.implicits._
|
||||
|
||||
val d: RDD[(String,String)] = spark.sparkContext.textFile(s"$entityPath/*")
|
||||
.map(i => extractPidsFromRecord(i))
|
||||
.filter(s => s != null && s._1!= null && s._2!=null && s._2.nonEmpty)
|
||||
.flatMap{ p =>
|
||||
p._2.map(pid =>
|
||||
(p._1, convertPidToDNETIdentifier(pid._1, pid._2))
|
||||
)
|
||||
}.filter(r =>r._1 != null || r._2 != null)
|
||||
|
||||
spark.createDataset(d)
|
||||
.groupByKey(_._2)
|
||||
.reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y)
|
||||
.map(s => s._2)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/relationResolvedPid")
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
This method should be used once we finally convert everythings in Kryo dataset
|
||||
instead of using rdd of json
|
||||
*/
|
||||
private def extractPidResolvedTableFromKryo(spark: SparkSession, entityPath: String, workingPath: String) = {
|
||||
import spark.implicits._
|
||||
implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||
val entities: Dataset[Result] = spark.read.load(s"$entityPath/*").as[Result]
|
||||
entities.flatMap(e => e.getPid.asScala
|
||||
.map(p =>
|
||||
convertPidToDNETIdentifier(p.getValue, p.getQualifier.getClassid))
|
||||
.filter(s => s != null)
|
||||
.map(s => (s, e.getId))
|
||||
).groupByKey(_._1)
|
||||
.reduceGroups((x, y) => if (x._2.startsWith("50|doi") || x._2.startsWith("50|pmid")) x else y)
|
||||
.map(s => s._2)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/relationResolvedPid")
|
||||
}
|
||||
|
||||
def convertPidToDNETIdentifier(pid:String, pidType: String):String = {
|
||||
if (pid==null || pid.isEmpty || pidType== null || pidType.isEmpty)
|
||||
null
|
||||
else
|
||||
s"unresolved::${pid.toLowerCase}::${pidType.toLowerCase}"
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,38 @@
|
||||
[
|
||||
{
|
||||
"paramName": "p",
|
||||
"paramLongName": "hdfsPath",
|
||||
"paramDescription": "the path where storing the sequential file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "u",
|
||||
"paramLongName": "mdstoreManagerUrl",
|
||||
"paramDescription": "the MdstoreManager url",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "f",
|
||||
"paramLongName": "mdFormat",
|
||||
"paramDescription": "metadata format",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "l",
|
||||
"paramLongName": "mdLayout",
|
||||
"paramDescription": "metadata layout",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "m",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be yarn or local",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "mdInterpretation",
|
||||
"paramDescription": "metadata interpretation",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -0,0 +1,30 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
@ -0,0 +1,4 @@
|
||||
## This is a classpath-based import file (this header is required)
|
||||
dump_complete classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/complete/oozie_app
|
||||
dump_funder classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/funder/oozie_app
|
||||
dump_community classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/community/oozie_app
|
@ -0,0 +1,306 @@
|
||||
<workflow-app name="dump_graph" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>singleDeposition</name>
|
||||
<description>Indicates if it is a single community deposition</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>communityId</name>
|
||||
<description>the id of the community to be dumped if a dump for a single community should be done</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>dumpType</name>
|
||||
<description>the type of the dump one of {complete, community, funder}</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>onlyUpload</name>
|
||||
<description>true if the dump is already done and should only be upload in zenodo</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>upload</name>
|
||||
<description>true if the dump should be upload in zenodo</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookUpUrl</name>
|
||||
<description>the isLookup service endpoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the output path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>resultAggregation</name>
|
||||
<description>true if all the result type have to be dumped under result. false otherwise</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>accessToken</name>
|
||||
<description>the access token used for the deposition in Zenodo</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>connectionUrl</name>
|
||||
<description>the connection url for Zenodo</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>metadata</name>
|
||||
<description> the metadata associated to the deposition</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>depositionType</name>
|
||||
<description>the type of deposition we want to perform. "new" for brand new deposition, "version" for a new version of a published deposition (in this case the concept record id must be provided), "upload" to upload content to an open deposition for which we already have the deposition id (in this case the deposition id should be provided)</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>conceptRecordId</name>
|
||||
<description>for new version, the id of the record for the old deposition</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>depositionId</name>
|
||||
<description>the depositionId of a deposition open that has to be added content</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>organizationCommunityMap</name>
|
||||
<description>the organization community map</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<description>the target hive database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<description>hive server jdbc url</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<description>hive server metastore URIs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="only_upload"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<decision name="only_upload">
|
||||
<switch>
|
||||
<case to="send_zenodo">${wf:conf('onlyUpload') eq true}</case>
|
||||
<default to="reset_outputpath"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="reset_outputpath">
|
||||
<fs>
|
||||
<delete path="${outputPath}"/>
|
||||
<mkdir path="${outputPath}"/>
|
||||
</fs>
|
||||
<ok to="save_community_map"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="save_community_map">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.dump.SaveCommunityMap</main-class>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/communityMap</arg>
|
||||
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||
<arg>--singleDeposition</arg><arg>${singleDeposition}</arg>
|
||||
<arg>--communityId</arg><arg>${communityId}</arg>
|
||||
</java>
|
||||
<ok to="choose_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="choose_dump">
|
||||
<switch>
|
||||
<case to="dump_funder">${wf:conf('dumpType') eq "funder"}</case>
|
||||
<case to="dump_community">${wf:conf('dumpType') eq "community"}</case>
|
||||
<default to="dump_complete"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<!-- Sub-workflow which runs the dump for the complete graph -->
|
||||
<action name="dump_complete">
|
||||
<sub-workflow>
|
||||
<app-path>${wf:appPath()}/dump_complete
|
||||
</app-path>
|
||||
<propagate-configuration/>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>communityMapPath</name>
|
||||
<value>${workingDir}/communityMap</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<value>${workingDir}/tar</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<value>${sourcePath}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>organizationCommunityMap</name>
|
||||
<value>${organizationCommunityMap}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookUpUrl</name>
|
||||
<value>${isLookUpUrl}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>resultAggregation</name>
|
||||
<value>${resultAggregation}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="make_archive" />
|
||||
<error to="Kill" />
|
||||
</action>
|
||||
|
||||
<!-- Sub-workflow which runs the dump for the complete graph -->
|
||||
<action name="dump_community">
|
||||
<sub-workflow>
|
||||
<app-path>${wf:appPath()}/dump_community
|
||||
</app-path>
|
||||
<propagate-configuration/>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<value>${sourcePath}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>communityMapPath</name>
|
||||
<value>${workingDir}/communityMap</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<value>${workingDir}/tar</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="make_archive" />
|
||||
<error to="Kill" />
|
||||
</action>
|
||||
|
||||
<action name="dump_funder">
|
||||
<sub-workflow>
|
||||
<app-path>${wf:appPath()}/dump_funder
|
||||
</app-path>
|
||||
<propagate-configuration/>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>communityMapPath</name>
|
||||
<value>${workingDir}/communityMap</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<value>${workingDir}/tar</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<value>${sourcePath}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>dumpType</name>
|
||||
<value>${dumpType}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="make_archive" />
|
||||
<error to="Kill" />
|
||||
</action>
|
||||
|
||||
<action name="make_archive">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.dump.MakeTar</main-class>
|
||||
<arg>--hdfsPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/tar</arg>
|
||||
</java>
|
||||
<ok to="should_upload"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="should_upload">
|
||||
<switch>
|
||||
<case to="send_zenodo">${wf:conf('upload') eq true}</case>
|
||||
<default to="End"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="send_zenodo">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class>
|
||||
<arg>--hdfsPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--accessToken</arg><arg>${accessToken}</arg>
|
||||
<arg>--connectionUrl</arg><arg>${connectionUrl}</arg>
|
||||
<arg>--metadata</arg><arg>${metadata}</arg>
|
||||
<arg>--conceptRecordId</arg><arg>${conceptRecordId}</arg>
|
||||
<arg>--depositionType</arg><arg>${depositionType}</arg>
|
||||
<arg>--depositionId</arg><arg>${depositionId}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
@ -0,0 +1,347 @@
|
||||
<workflow-app name="sub_dump_community_funder_results" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the output path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>communityMapPath</name>
|
||||
<description>the path to the community map</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>selectedResults</name>
|
||||
<description>the path the the possible subset ot results to be dumped</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<description>the target hive database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<description>hive server jdbc url</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<description>hive server metastore URIs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="fork_dump"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
|
||||
|
||||
<fork name="fork_dump">
|
||||
<path start="dump_publication"/>
|
||||
<path start="dump_dataset"/>
|
||||
<path start="dump_orp"/>
|
||||
<path start="dump_software"/>
|
||||
</fork>
|
||||
|
||||
<action name="dump_publication">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table publication for community/funder related products</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${selectedResults}/publication</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/dump/publication</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
<arg>--dumpType</arg><arg>${dumpType}</arg>
|
||||
</spark>
|
||||
<ok to="join_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="dump_dataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table dataset for community/funder related products</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${selectedResults}/dataset</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/dump/dataset</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="dump_orp">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table ORP for community related products</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${selectedResults}/otherresearchproduct</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/dump/otherresearchproduct</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="dump_software">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table software for community related products</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${selectedResults}/software</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/dump/software</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="join_dump" to="prepareResultProject"/>
|
||||
|
||||
<action name="prepareResultProject">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Prepare association result subset of project info</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkPrepareResultProject</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
|
||||
</spark>
|
||||
<ok to="fork_extendWithProject"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="fork_extendWithProject">
|
||||
<path start="extend_publication"/>
|
||||
<path start="extend_dataset"/>
|
||||
<path start="extend_orp"/>
|
||||
<path start="extend_software"/>
|
||||
</fork>
|
||||
|
||||
<action name="extend_publication">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Extend dumped publications with information about project</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/dump/publication</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/ext/publication</arg>
|
||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
|
||||
</spark>
|
||||
<ok to="join_extend"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="extend_dataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Extend dumped dataset with information about project</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/dump/dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/ext/dataset</arg>
|
||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
|
||||
</spark>
|
||||
<ok to="join_extend"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="extend_orp">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Extend dumped ORP with information about project</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/dump/otherresearchproduct</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/ext/orp</arg>
|
||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
|
||||
</spark>
|
||||
<ok to="join_extend"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="extend_software">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Extend dumped software with information about project</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkUpdateProjectInfo</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/dump/software</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/ext/software</arg>
|
||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
|
||||
</spark>
|
||||
<ok to="join_extend"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<join name="join_extend" to="End"/>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
||||
|
||||
|
@ -0,0 +1,30 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
@ -0,0 +1,2 @@
|
||||
## This is a classpath-based import file (this header is required)
|
||||
dump_common classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/commoncommunityfunder/oozie_app
|
@ -0,0 +1,145 @@
|
||||
<workflow-app name="sub_dump_community_products" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the output path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<description>the target hive database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<description>hive server jdbc url</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<description>hive server metastore URIs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="common_action_community_funder"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="common_action_community_funder">
|
||||
<sub-workflow>
|
||||
<app-path>${wf:appPath()}/dump_common
|
||||
</app-path>
|
||||
<propagate-configuration/>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<value>${sourcePath}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>selectedResults</name>
|
||||
<value>${sourcePath}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>communityMapPath</name>
|
||||
<value>${workingDir}/communityMap</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<value>${workingDir}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="splitForCommunities" />
|
||||
<error to="Kill" />
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
<action name="splitForCommunities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Split dumped result for community</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.community.SparkSplitForCommunity</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/ext</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
@ -0,0 +1,30 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
@ -0,0 +1,537 @@
|
||||
<workflow-app name="sub-dump_complete" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the output path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>resultAggregation</name>
|
||||
<description>true if all the result type have to be dumped under result. false otherwise</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>organizationCommunityMap</name>
|
||||
<description>the organization community map</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<description>the target hive database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<description>hive server jdbc url</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<description>hive server metastore URIs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="fork_dump" />
|
||||
|
||||
|
||||
|
||||
<fork name="fork_dump">
|
||||
<path start="dump_publication"/>
|
||||
<path start="dump_dataset"/>
|
||||
<path start="dump_orp"/>
|
||||
<path start="dump_software"/>
|
||||
<path start="dump_organization"/>
|
||||
<path start="dump_project"/>
|
||||
<path start="dump_datasource"/>
|
||||
<path start="select_relation"/>
|
||||
</fork>
|
||||
|
||||
<action name="dump_publication">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table publication </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/result/publication</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="dump_dataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table dataset </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/result/dataset</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="dump_orp">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table ORP </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/result/otherresearchproduct</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="dump_software">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table software </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/result/software</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="dump_organization">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table organization </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/organization</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/organization</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="dump_project">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table project </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/project</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/project</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="dump_datasource">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table datasource </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/datasource</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/datasource</arg>
|
||||
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
|
||||
</spark>
|
||||
<ok to="join_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="select_relation">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Select valid table relation </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkSelectValidRelationsJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/validrelation</arg>
|
||||
</spark>
|
||||
<ok to="dump_relation"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="dump_relation">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table relation </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpRelationJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/validrelation</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/relation/relation</arg>
|
||||
</spark>
|
||||
<ok to="join_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="join_dump" to="fork_context"/>
|
||||
|
||||
<fork name="fork_context">
|
||||
<path start="create_entities_fromcontext"/>
|
||||
<path start="create_relation_fromcontext"/>
|
||||
<path start="create_relation_fromorgs"/>
|
||||
</fork>
|
||||
|
||||
<action name="create_entities_fromcontext">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.dump.complete.CreateContextEntities</main-class>
|
||||
<arg>--hdfsPath</arg><arg>${outputPath}/communities_infrastructures/communities_infrastructure.json.gz</arg>
|
||||
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||
</java>
|
||||
<ok to="join_context"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="create_relation_fromcontext">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.dump.complete.CreateContextRelation</main-class>
|
||||
<arg>--hdfsPath</arg><arg>${workingDir}/relation/context</arg>
|
||||
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||
</java>
|
||||
<ok to="join_context"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="create_relation_fromorgs">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table relation </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkOrganizationRelation</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/relation/contextOrg</arg>
|
||||
<arg>--organizationCommunityMap</arg><arg>${organizationCommunityMap}</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_context"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="join_context" to="fork_extract_relations"/>
|
||||
|
||||
<fork name="fork_extract_relations">
|
||||
<path start="rels_from_pubs"/>
|
||||
<path start="rels_from_dats"/>
|
||||
<path start="rels_from_orp"/>
|
||||
<path start="rels_from_sw"/>
|
||||
</fork>
|
||||
|
||||
<action name="rels_from_pubs">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Extract Relations from publication </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/relation/publication</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_extract_relations"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="rels_from_dats">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table dataset </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/relation/dataset</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_extract_relations"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="rels_from_orp">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table ORP </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/relation/orp</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_extract_relations"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="rels_from_sw">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump table software </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/relation/software</arg>
|
||||
<arg>--communityMapPath</arg><arg>${communityMapPath}</arg>
|
||||
</spark>
|
||||
<ok to="join_extract_relations"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="join_extract_relations" to="collect_and_save"/>
|
||||
|
||||
<action name="collect_and_save">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Collect Results and Relations and put them in the right path </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkCollectAndSave</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--resultAggregation</arg><arg>${resultAggregation}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Sub-workflow dump complete failed with error message ${wf:errorMessage()}
|
||||
</message>
|
||||
</kill>
|
||||
|
||||
<end name="End" />
|
||||
</workflow-app>
|
@ -0,0 +1,30 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
@ -0,0 +1,2 @@
|
||||
## This is a classpath-based import file (this header is required)
|
||||
dump_common classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/commoncommunityfunder/oozie_app
|
@ -0,0 +1,256 @@
|
||||
<workflow-app name="sub_dump_funder_results" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the output path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<description>the target hive database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<description>hive server jdbc url</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<description>hive server metastore URIs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="fork_result_linked_to_projects"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<fork name="fork_result_linked_to_projects">
|
||||
<path start="select_publication_linked_to_projects"/>
|
||||
<path start="select_dataset_linked_to_projects"/>
|
||||
<path start="select_orp_linked_to_project"/>
|
||||
<path start="select_software_linked_to_projects"/>
|
||||
</fork>
|
||||
|
||||
<action name="select_publication_linked_to_projects">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump funder results </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/result/publication</arg>
|
||||
<arg>--graphPath</arg><arg>${sourcePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_link"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="select_dataset_linked_to_projects">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump funder results </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/result/dataset</arg>
|
||||
<arg>--graphPath</arg><arg>${sourcePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_link"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="select_orp_linked_to_project">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump funder results </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/result/otherresearchproduct</arg>
|
||||
<arg>--graphPath</arg><arg>${sourcePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_link"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="select_software_linked_to_projects">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump funder results </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkResultLinkedToProject</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/result/software</arg>
|
||||
<arg>--graphPath</arg><arg>${sourcePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_link"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="join_link" to="common_action_community_funder"/>
|
||||
|
||||
<action name="common_action_community_funder">
|
||||
<sub-workflow>
|
||||
<app-path>${wf:appPath()}/dump_common
|
||||
</app-path>
|
||||
<propagate-configuration/>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<value>${sourcePath}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>selectedResults</name>
|
||||
<value>${workingDir}/result</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>communityMapPath</name>
|
||||
<value>${workingDir}/communityMap</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<value>${workingDir}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="dump_funder_results" />
|
||||
<error to="Kill" />
|
||||
</action>
|
||||
|
||||
<action name="dump_funder_results">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Dump funder results </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.funderresults.SparkDumpFunderResults</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/ext</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--graphPath</arg><arg>${sourcePath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
@ -1,59 +1,37 @@
|
||||
<workflow-app name="Resolve Relation" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>entityPath</name>
|
||||
<description>the path of deduplicate Entities</description>
|
||||
<name>graphBasePath</name>
|
||||
<description>the path of the graph</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>relationPath</name>
|
||||
<description>the path of relation unresolved</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the path of relation unresolved</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<start to="DropRelFolder"/>
|
||||
<start to="ResolveRelations"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="DropRelFolder">
|
||||
<fs>
|
||||
<delete path='${targetPath}/relation'/>
|
||||
<delete path='${targetPath}/relation_resolved'/>
|
||||
<delete path='${targetPath}/resolvedSource'/>
|
||||
<delete path='${targetPath}/resolvedPid'/>
|
||||
|
||||
</fs>
|
||||
<ok to="ResolveRelations"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="ResolveRelations">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Resolve Relations in raw graph</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkResolveRelation</class>
|
||||
<class>eu.dnetlib.dhp.oa.graph.resolution.SparkResolveRelation</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=3000
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--relationPath</arg><arg>${relationPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--entityPath</arg><arg>${entityPath}</arg>
|
||||
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
@ -1,6 +1,5 @@
|
||||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"r", "paramLongName":"relationPath", "paramDescription": "the source Path", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the source Path", "paramRequired": true},
|
||||
{"paramName":"e", "paramLongName":"entityPath", "paramDescription": "the path of the raw graph", "paramRequired": true}
|
||||
{"paramName":"g", "paramLongName":"graphBasePath", "paramDescription": "the path of the raw graph", "paramRequired": true}
|
||||
]
|
@ -0,0 +1,13 @@
|
||||
SELECT
|
||||
id1 AS source,
|
||||
id2 AS target,
|
||||
reltype AS type,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
0.95 AS trust,
|
||||
'' AS inferenceprovenance,
|
||||
'openaire____::openorgs' AS collectedfromid,
|
||||
'OpenOrgs Database' AS collectedfromname,
|
||||
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction
|
||||
FROM relationships
|
||||
WHERE reltype = 'Child' OR reltype = 'Parent'
|
@ -1,177 +0,0 @@
|
||||
<workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>PDBPath</name>
|
||||
<description>the PDB Database Working Path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>UNIPROTDBPath</name>
|
||||
<description>the UNIPROT Database Working Path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>EBIDataset</name>
|
||||
<description>the EBI Links Dataset Path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>ScholixResolvedDBPath</name>
|
||||
<description>the Scholix Resolved Dataset Path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>CrossrefLinksPath</name>
|
||||
<description>the CrossrefLinks Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the Target Working dir path</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ConvertPDB"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ConvertPDB">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert PDB to OAF Dataset</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.bio.SparkTransformBioDatabaseToOAF</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--dbPath</arg><arg>${PDBPath}</arg>
|
||||
<arg>--database</arg><arg>PDB</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/pdb_OAF</arg>
|
||||
</spark>
|
||||
<ok to="ConvertUNIPROT"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ConvertUNIPROT">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert UNIPROT to OAF Dataset</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.bio.SparkTransformBioDatabaseToOAF</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--dbPath</arg><arg>${UNIPROTDBPath}</arg>
|
||||
<arg>--database</arg><arg>UNIPROT</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/uniprot_OAF</arg>
|
||||
</spark>
|
||||
<ok to="ConvertEBILinks"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ConvertEBILinks">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert EBI Links to OAF Dataset</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.ebi.SparkEBILinksToOaf</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--sourcePath</arg><arg>${EBIDataset}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/ebi_OAF</arg>
|
||||
</spark>
|
||||
<ok to="ConvertScholixResolved"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ConvertScholixResolved">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Scholix to OAF Dataset</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.bio.SparkTransformBioDatabaseToOAF</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--dbPath</arg><arg>${ScholixResolvedDBPath}</arg>
|
||||
<arg>--database</arg><arg>SCHOLIX</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/scholix_resolved_OAF</arg>
|
||||
</spark>
|
||||
<ok to="ConvertCrossrefLinks"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ConvertCrossrefLinks">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Crossref Links to OAF Dataset</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.bio.SparkTransformBioDatabaseToOAF</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--dbPath</arg><arg>${CrossrefLinksPath}</arg>
|
||||
<arg>--database</arg><arg>CROSSREF_LINKS</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/crossref_unresolved_relation_OAF</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue