added needed parameter
This commit is contained in:
parent
3eca5d2e1c
commit
d410ea8a41
|
@ -2,26 +2,19 @@
|
|||
package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
|
||||
|
||||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.bulktag.community.ResultTagger;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
|
@ -54,7 +47,7 @@ public class AppendNewRelations implements Serializable {
|
|||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkHiveSession(
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> appendNewRelation(spark, inputPath, outputPath));
|
||||
|
|
|
@ -5,9 +5,10 @@
|
|||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>sets the outputPath</description>
|
||||
<name>iterations</name>
|
||||
<description>the number of hops to be done up on the hierarchy</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
|
@ -21,119 +22,26 @@
|
|||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="resume_from"/>
|
||||
<start to="reset_outputpath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="prepare_info">${wf:conf('resumeFrom') eq 'PrepareInfo'}</case>
|
||||
<default to="reset_outputpath"/> <!-- first action to be done when downloadDump is to be performed -->
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="reset_outputpath">
|
||||
<fs>
|
||||
<delete path="${outputPath}"/>
|
||||
<mkdir path="${outputPath}"/>
|
||||
<delete path="${workingDir}"/>
|
||||
<mkdir path="${workingDir}"/>
|
||||
</fs>
|
||||
<ok to="copy_entities"/>
|
||||
<ok to="prepare_info"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="copy_entities">
|
||||
<path start="copy_relation"/>
|
||||
<path start="copy_publication"/>
|
||||
<path start="copy_dataset"/>
|
||||
<path start="copy_orp"/>
|
||||
<path start="copy_software"/>
|
||||
<path start="copy_organization"/>
|
||||
<path start="copy_projects"/>
|
||||
<path start="copy_datasources"/>
|
||||
</fork>
|
||||
|
||||
<action name="copy_relation">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
||||
</distcp>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="copy_publication">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<arg>${nameNode}/${sourcePath}/publication</arg>
|
||||
<arg>${nameNode}/${outputPath}/publication</arg>
|
||||
</distcp>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="copy_dataset">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
||||
<arg>${nameNode}/${outputPath}/dataset</arg>
|
||||
</distcp>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="copy_orp">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
||||
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
||||
</distcp>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="copy_software">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<arg>${nameNode}/${sourcePath}/software</arg>
|
||||
<arg>${nameNode}/${outputPath}/software</arg>
|
||||
</distcp>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="copy_organization">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
||||
</distcp>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="copy_projects">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
||||
<arg>${nameNode}/${outputPath}/project</arg>
|
||||
</distcp>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="copy_datasources">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
||||
</distcp>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait" to="prepare_info"/>
|
||||
|
||||
|
||||
<action name="prepare_info">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PrepareResultOrganizationAssociation</name>
|
||||
<name>PrepareResultProjectOrganizationAssociation</name>
|
||||
<class>eu.dnetlib.dhp.entitytoorganizationfromsemrel.PrepareInfo</class>
|
||||
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
|
@ -161,7 +69,7 @@
|
|||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>resultToOrganizationFromSemRel</name>
|
||||
<name>resultProjectToOrganizationFromSemRel</name>
|
||||
<class>eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkEntityToOrganizationFromSemRel</class>
|
||||
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
|
@ -177,7 +85,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--relationPath</arg><arg>${workingDir}/preparedInfo/relation</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||
<arg>--outputPath</arg><arg>${sourcePath}/relation</arg>
|
||||
<arg>--leavesPath</arg><arg>${workingDir}/preparedInfo/leavesPath</arg>
|
||||
<arg>--childParentPath</arg><arg>${workingDir}/preparedInfo/childParentPath</arg>
|
||||
<arg>--resultOrgPath</arg><arg>${workingDir}/preparedInfo/resultOrgPath</arg>
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
[
|
||||
{
|
||||
"paramName":"s",
|
||||
"paramLongName":"sourcePath",
|
||||
"paramDescription": "the path of the sequencial file to read",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
||||
{
|
||||
"paramName": "ssm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "the path where prepared info have been stored",
|
||||
"paramRequired": false
|
||||
},{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "institutional repositories that should not be considered for the propagation",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -1,5 +1,5 @@
|
|||
sourcePath=/tmp/beta_provision/graph/09_graph_dedup_enriched
|
||||
resumeFrom=default
|
||||
resumeFrom=AffiliationSemanticRelation
|
||||
allowedsemrelsorcidprop=isSupplementedBy;isSupplementTo
|
||||
allowedsemrelsresultproject=isSupplementedBy;isSupplementTo
|
||||
allowedsemrelscommunitysemrel=isSupplementedBy;isSupplementTo
|
||||
|
@ -24,5 +24,5 @@ pathMap ={"author":"$['author'][*]['fullname']", \
|
|||
blacklist=empty
|
||||
allowedpids=orcid;orcid_pending
|
||||
baseURL = https://services.openaire.eu/openaire/community/
|
||||
|
||||
iterations=1
|
||||
|
||||
|
|
|
@ -195,13 +195,13 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="affiliation_semantic_relation" />
|
||||
<ok to="entity_semantic_relation" />
|
||||
<error to="Kill" />
|
||||
</action>
|
||||
|
||||
<action name="affiliation_semantic_relation">
|
||||
<action name="entity_semantic_relation">
|
||||
<sub-workflow>
|
||||
<app-path>${wf:appPath()}/affiliation_semantic_relation
|
||||
<app-path>${wf:appPath()}/entity_semantic_relation
|
||||
</app-path>
|
||||
<propagate-configuration/>
|
||||
<configuration>
|
||||
|
@ -209,6 +209,10 @@
|
|||
<name>sourcePath</name>
|
||||
<value>${outputPath}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>iterations</name>
|
||||
<value>${iterations}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="community_organization" />
|
||||
|
|
Loading…
Reference in New Issue