[Enrichment single step] modification of workflow ans some change in the classes
This commit is contained in:
parent
b0969461f8
commit
de9d0ace38
|
@ -8,6 +8,8 @@ import java.util.Arrays;
|
|||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import eu.dnetlib.dhp.countrypropagation.pojo.CountrySbs;
|
||||
import eu.dnetlib.dhp.countrypropagation.pojo.DatasourceCountry;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
|
|
|
@ -7,12 +7,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.countrypropagation.pojo.CountrySbs;
|
||||
import eu.dnetlib.dhp.countrypropagation.pojo.DatasourceCountry;
|
||||
import eu.dnetlib.dhp.countrypropagation.pojo.ResultCountrySet;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
|
|
|
@ -9,6 +9,8 @@ import java.util.List;
|
|||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.countrypropagation.pojo.CountrySbs;
|
||||
import eu.dnetlib.dhp.countrypropagation.pojo.ResultCountrySet;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.countrypropagation;
|
||||
package eu.dnetlib.dhp.countrypropagation.pojo;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.countrypropagation;
|
||||
package eu.dnetlib.dhp.countrypropagation.pojo;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
|
@ -1,5 +1,7 @@
|
|||
|
||||
package eu.dnetlib.dhp.countrypropagation;
|
||||
package eu.dnetlib.dhp.countrypropagation.pojo;
|
||||
|
||||
import eu.dnetlib.dhp.countrypropagation.pojo.CountrySbs;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
|
@ -51,8 +51,7 @@ public class SparkResultToProjectThroughSemRelJob {
|
|||
final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
|
||||
log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);
|
||||
|
||||
final Boolean saveGraph = Boolean.valueOf(parser.get("saveGraph"));
|
||||
log.info("saveGraph: {}", saveGraph);
|
||||
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
|
@ -60,11 +59,9 @@ public class SparkResultToProjectThroughSemRelJob {
|
|||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
if (isTest(parser)) {
|
||||
removeOutputDir(spark, outputPath);
|
||||
}
|
||||
|
||||
execPropagation(
|
||||
spark, outputPath, alreadyLinkedPath, potentialUpdatePath, saveGraph);
|
||||
spark, outputPath, alreadyLinkedPath, potentialUpdatePath);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -72,13 +69,12 @@ public class SparkResultToProjectThroughSemRelJob {
|
|||
SparkSession spark,
|
||||
String outputPath,
|
||||
String alreadyLinkedPath,
|
||||
String potentialUpdatePath,
|
||||
Boolean saveGraph) {
|
||||
String potentialUpdatePath) {
|
||||
|
||||
Dataset<ResultProjectSet> toaddrelations = readPath(spark, potentialUpdatePath, ResultProjectSet.class);
|
||||
Dataset<ResultProjectSet> alreadyLinked = readPath(spark, alreadyLinkedPath, ResultProjectSet.class);
|
||||
|
||||
if (saveGraph) {
|
||||
|
||||
toaddrelations
|
||||
.joinWith(
|
||||
alreadyLinked,
|
||||
|
@ -89,7 +85,7 @@ public class SparkResultToProjectThroughSemRelJob {
|
|||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static FlatMapFunction<Tuple2<ResultProjectSet, ResultProjectSet>, Relation> mapRelationRn() {
|
||||
|
|
|
@ -56,11 +56,7 @@ public class SparkResultToCommunityFromOrganizationJob {
|
|||
final String resultClassName = parser.get("resultTableName");
|
||||
log.info("resultTableName: {}", resultClassName);
|
||||
|
||||
final Boolean saveGraph = Optional
|
||||
.ofNullable(parser.get("saveGraph"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("saveGraph: {}", saveGraph);
|
||||
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
|
||||
|
@ -72,10 +68,9 @@ public class SparkResultToCommunityFromOrganizationJob {
|
|||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
if (saveGraph) {
|
||||
|
||||
execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath);
|
||||
}
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -70,13 +70,10 @@ public class SparkResultToCommunityThroughSemRelJob {
|
|||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
if (isTest(parser)) {
|
||||
removeOutputDir(spark, outputPath);
|
||||
}
|
||||
if (saveGraph) {
|
||||
|
||||
execPropagation(
|
||||
spark, inputPath, outputPath, preparedInfoPath, resultClazz);
|
||||
}
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -98,13 +98,13 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
|
|||
String leavesPath,
|
||||
String childParentPath,
|
||||
String resultOrganizationPath,
|
||||
String graphPath,
|
||||
String relationPath,
|
||||
String workingPath,
|
||||
String outputPath,
|
||||
int iterations) {
|
||||
if (iterations == 1) {
|
||||
doPropagateOnce(
|
||||
spark, leavesPath, childParentPath, resultOrganizationPath, graphPath,
|
||||
spark, leavesPath, childParentPath, resultOrganizationPath, relationPath,
|
||||
workingPath, outputPath);
|
||||
} else {
|
||||
|
||||
|
@ -123,26 +123,26 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
|
|||
notReachedFirstParent);
|
||||
|
||||
doPropagate(
|
||||
spark, leavesPath, childParentPath, resultOrganizationPath, graphPath,
|
||||
spark, leavesPath, childParentPath, resultOrganizationPath, relationPath,
|
||||
workingPath, outputPath, propagationCounter);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void doPropagateOnce(SparkSession spark, String leavesPath, String childParentPath,
|
||||
String resultOrganizationPath, String graphPath, String workingPath,
|
||||
String resultOrganizationPath, String relationPath, String workingPath,
|
||||
String outputPath) {
|
||||
|
||||
StepActions
|
||||
.execStep(
|
||||
spark, graphPath, workingPath + NEW_RELATION_PATH,
|
||||
spark, relationPath, workingPath + NEW_RELATION_PATH,
|
||||
leavesPath, childParentPath, resultOrganizationPath);
|
||||
|
||||
addNewRelations(spark, workingPath + NEW_RELATION_PATH, outputPath);
|
||||
}
|
||||
|
||||
private static void doPropagate(SparkSession spark, String leavesPath, String childParentPath,
|
||||
String resultOrganizationPath, String graphPath, String workingPath, String outputPath,
|
||||
String resultOrganizationPath, String relationPath, String workingPath, String outputPath,
|
||||
PropagationCounter propagationCounter) {
|
||||
int iteration = 0;
|
||||
long leavesCount;
|
||||
|
@ -151,7 +151,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
|
|||
iteration++;
|
||||
StepActions
|
||||
.execStep(
|
||||
spark, graphPath, workingPath + NEW_RELATION_PATH,
|
||||
spark, relationPath, workingPath + NEW_RELATION_PATH,
|
||||
leavesPath, childParentPath, resultOrganizationPath);
|
||||
StepActions
|
||||
.prepareForNextStep(
|
||||
|
|
|
@ -27,10 +27,10 @@ import scala.Tuple2;
|
|||
public class StepActions implements Serializable {
|
||||
|
||||
public static void execStep(SparkSession spark,
|
||||
String graphPath, String newRelationPath,
|
||||
String relationPath, String newRelationPath,
|
||||
String leavesPath, String chldParentOrgPath, String resultOrgPath) {
|
||||
|
||||
Dataset<Relation> relationGraph = readPath(spark, graphPath, Relation.class);
|
||||
Dataset<Relation> relationGraph = readPath(spark, relationPath, Relation.class);
|
||||
// select only the relation source target among those proposed by propagation that are not already existent
|
||||
getNewRels(
|
||||
newRelationPath, relationGraph,
|
||||
|
|
|
@ -176,10 +176,6 @@
|
|||
<name>pathMap</name>
|
||||
<value>${pathMap}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<value>${workingDir}/results</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="affiliation_inst_repo" />
|
||||
|
@ -220,10 +216,6 @@
|
|||
<name>sourcePath</name>
|
||||
<value>${outputPath}</value>
|
||||
</property>
|
||||
<!-- <property>-->
|
||||
<!-- <name>outputPath</name>-->
|
||||
<!-- <value>${outputPath}</value>-->
|
||||
<!-- </property>-->
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="community_organization" />
|
||||
|
@ -240,10 +232,6 @@
|
|||
<name>sourcePath</name>
|
||||
<value>${outputPath}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<value>${outputPath}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>organizationtoresultcommunitymap</name>
|
||||
<value>${organizationtoresultcommunitymap}</value>
|
||||
|
@ -264,10 +252,6 @@
|
|||
<name>sourcePath</name>
|
||||
<value>${outputPath}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<value>${outputPath}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>allowedsemrels</name>
|
||||
<value>${allowedsemrelsresultproject}</value>
|
||||
|
@ -280,7 +264,7 @@
|
|||
|
||||
<action name="community_sem_rel">
|
||||
<sub-workflow>
|
||||
<app-path>${wf:appPath()}/result_project
|
||||
<app-path>${wf:appPath()}/community_sem_rel
|
||||
</app-path>
|
||||
<propagate-configuration/>
|
||||
<configuration>
|
||||
|
@ -288,10 +272,7 @@
|
|||
<name>sourcePath</name>
|
||||
<value>${outputPath}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<value>${workingDir}/communitysemrel</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>allowedsemrels</name>
|
||||
<value>${allowedsemrelscommunitysemrel}</value>
|
||||
|
|
|
@ -186,7 +186,7 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${outputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/eoscTag</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/bulktag</arg>
|
||||
</spark>
|
||||
<ok to="eosc_get_datasource_master"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -230,7 +230,7 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${outputPath}/publication</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/publication</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/bulktag/publication</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
|
||||
</spark>
|
||||
|
@ -256,7 +256,7 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${outputPath}/dataset</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/dataset</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/bulktag/dataset</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
|
||||
</spark>
|
||||
|
@ -281,7 +281,7 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${outputPath}/software</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/software</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/bulktag/software</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
|
||||
</spark>
|
||||
|
@ -306,14 +306,24 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/otherresearchproduct</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/bulktag/otherresearchproduct</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
|
||||
</spark>
|
||||
<ok to="wait_eosc_context_tag"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<join name="wait_eosc_context_tag" to="End"/>
|
||||
<join name="wait_eosc_context_tag" to="reset_workingDir"/>
|
||||
|
||||
<action name="reset_workingDir">
|
||||
<fs>
|
||||
<delete path="${workingDir}"/>
|
||||
<mkdir path="${workingDir}"/>
|
||||
</fs>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -30,20 +30,13 @@
|
|||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="reset_outputpath"/>
|
||||
<start to="prepare_datasource_country_association"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="reset_outputpath">
|
||||
<fs>
|
||||
<delete path="${outputPath}"/>
|
||||
<mkdir path="${outputPath}"/>
|
||||
</fs>
|
||||
<ok to="prepare_datasource_country_association"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="prepare_datasource_country_association">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
@ -222,7 +215,7 @@
|
|||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/country/publication</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/country/result/publication</arg>
|
||||
</spark>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -251,7 +244,7 @@
|
|||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/country/dataset</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/country/result/dataset</arg>
|
||||
</spark>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -280,7 +273,7 @@
|
|||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/country/otherresearchproduct</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/country/result/otherresearchproduct</arg>
|
||||
</spark>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -309,14 +302,21 @@
|
|||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/country/software</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/country/result/software</arg>
|
||||
</spark>
|
||||
<ok to="wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait" to="End"/>
|
||||
|
||||
<join name="wait" to="reset_workingDir"/>
|
||||
<action name="reset_workingDir">
|
||||
<fs>
|
||||
<delete path="${workingDir}"/>
|
||||
<mkdir path="${workingDir}"/>
|
||||
</fs>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -390,7 +390,16 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait2" to="End"/>
|
||||
<join name="wait2" to="reset_workingDir"/>
|
||||
|
||||
<action name="reset_workingDir">
|
||||
<fs>
|
||||
<delete path="${workingDir}"/>
|
||||
<mkdir path="${workingDir}"/>
|
||||
</fs>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
|
|
|
@ -8,10 +8,7 @@
|
|||
<name>allowedsemrels</name>
|
||||
<description>the allowed semantics </description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the output path</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
|
@ -76,16 +73,22 @@
|
|||
--conf spark.dynamicAllocation.enabled=true
|
||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||
</spark-opts>
|
||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
||||
<arg>--outputPath</arg><arg>${sourcePath}/relation</arg>
|
||||
<arg>--potentialUpdatePath</arg><arg>${workingDir}/resultproject/preparedInfo/potentialUpdates</arg>
|
||||
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/resultproject/preparedInfo/alreadyLinked</arg>
|
||||
</spark>
|
||||
<ok to="reset_workingDir"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="reset_workingDir">
|
||||
<fs>
|
||||
<delete path="${workingDir}"/>
|
||||
<mkdir path="${workingDir}"/>
|
||||
</fs>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -8,10 +8,7 @@
|
|||
<name>organizationtoresultcommunitymap</name>
|
||||
<description>organization community map</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the output path</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
|
@ -25,21 +22,12 @@
|
|||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="reset_outputpath"/>
|
||||
<start to="prepare_result_communitylist"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="reset_outputpath">
|
||||
<fs>
|
||||
<delete path="${outputPath}"/>
|
||||
<mkdir path="${outputPath}"/>
|
||||
</fs>
|
||||
<ok to="prepare_result_communitylist"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="prepare_result_communitylist">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
@ -97,7 +85,7 @@
|
|||
<arg>--outputPath</arg><arg>${workingDir}/communityorganization/publication</arg>
|
||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="wait2"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -126,7 +114,7 @@
|
|||
<arg>--outputPath</arg><arg>${workingDir}/communityorganization/dataset</arg>
|
||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="wait2"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -155,7 +143,7 @@
|
|||
<arg>--outputPath</arg><arg>${workingDir}/communityorganization/otherresearchproduct</arg>
|
||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="wait2"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -184,14 +172,22 @@
|
|||
<arg>--outputPath</arg><arg>${workingDir}/communityorganization/software</arg>
|
||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="wait2"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait2" to="End"/>
|
||||
<join name="wait2" to="reset_workingDir"/>
|
||||
|
||||
<action name="reset_workingDir">
|
||||
<fs>
|
||||
<delete path="${workingDir}"/>
|
||||
<mkdir path="${workingDir}"/>
|
||||
</fs>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -18,21 +18,12 @@
|
|||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="reset_outputpath"/>
|
||||
<start to="fork_prepare_assoc_step1"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="reset_outputpath">
|
||||
<fs>
|
||||
<delete path="${outputPath}"/>
|
||||
<mkdir path="${outputPath}"/>
|
||||
</fs>
|
||||
<ok to="fork_prepare_assoc_step1"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
<fork name="fork_prepare_assoc_step1">
|
||||
|
@ -214,8 +205,8 @@
|
|||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/communitysemrel/publication</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="wait2"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -243,8 +234,8 @@
|
|||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
|
||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/communitysemrel/dataset</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="wait2"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -272,8 +263,8 @@
|
|||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/communitysemrel/otherresearchproduct</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="wait2"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -301,15 +292,22 @@
|
|||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
|
||||
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/communitysemrel/software</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="wait2"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait2" to="End"/>
|
||||
|
||||
<join name="wait2" to="reset_workingDir"/>
|
||||
<action name="reset_workingDir">
|
||||
<fs>
|
||||
<delete path="${workingDir}"/>
|
||||
<mkdir path="${workingDir}"/>
|
||||
</fs>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -4,10 +4,7 @@
|
|||
<name>sourcePath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<!-- <property>-->
|
||||
<!-- <name>outputPath</name>-->
|
||||
<!-- <description>sets the outputPath</description>-->
|
||||
<!-- </property>-->
|
||||
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
|
@ -27,23 +24,6 @@
|
|||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<!-- <decision name="resume_from">-->
|
||||
<!-- <switch>-->
|
||||
<!-- <case to="prepare_info">${wf:conf('resumeFrom') eq 'PrepareInfo'}</case>-->
|
||||
<!-- <default to="reset_outputpath"/> <!– first action to be done when downloadDump is to be performed –>-->
|
||||
<!-- </switch>-->
|
||||
<!-- </decision>-->
|
||||
|
||||
<!-- <action name="reset_outputpath">-->
|
||||
<!-- <fs>-->
|
||||
<!-- <delete path="${outputPath}"/>-->
|
||||
<!-- <mkdir path="${outputPath}"/>-->
|
||||
<!-- </fs>-->
|
||||
<!-- <ok to="prepare_info"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
|
||||
|
||||
<action name="prepare_info">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
@ -99,12 +79,19 @@
|
|||
<arg>--workingDir</arg><arg>${workingDir}/affiliationSemanticRelation/working</arg>
|
||||
<arg>--iterations</arg><arg>${iterations}</arg>
|
||||
</spark>
|
||||
<ok to="reset_workingDir"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="reset_workingDir">
|
||||
<fs>
|
||||
<delete path="${workingDir}"/>
|
||||
<mkdir path="${workingDir}"/>
|
||||
</fs>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -5,6 +5,7 @@ import java.io.IOException;
|
|||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import eu.dnetlib.dhp.countrypropagation.pojo.DatasourceCountry;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
|
||||
package eu.dnetlib.dhp.countrypropagation;
|
||||
|
||||
import static eu.dnetlib.dhp.PropagationConstant.isSparkSessionManaged;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import eu.dnetlib.dhp.countrypropagation.pojo.ResultCountrySet;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
|
|
@ -33,32 +33,32 @@ public class ProjectPropagationJobTest {
|
|||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
private static final SparkConf conf = new SparkConf();
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName());
|
||||
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
conf.setAppName(ProjectPropagationJobTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(ProjectPropagationJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
.builder()
|
||||
.appName(ProjectPropagationJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
|
@ -71,6 +71,7 @@ public class ProjectPropagationJobTest {
|
|||
@Test
|
||||
void NoUpdateTest() throws Exception {
|
||||
|
||||
workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName());
|
||||
final String potentialUpdateDate = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates")
|
||||
|
@ -82,10 +83,10 @@ public class ProjectPropagationJobTest {
|
|||
SparkResultToProjectThroughSemRelJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isTest", Boolean.TRUE.toString(),
|
||||
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-hive_metastore_uris", "",
|
||||
"-saveGraph", "true",
|
||||
|
||||
"-outputPath", workingDir.toString() + "/relation",
|
||||
"-potentialUpdatePath", potentialUpdateDate,
|
||||
"-alreadyLinkedPath", alreadyLinkedPath,
|
||||
|
@ -98,6 +99,10 @@ public class ProjectPropagationJobTest {
|
|||
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
|
||||
|
||||
Assertions.assertEquals(0, tmp.count());
|
||||
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -107,6 +112,12 @@ public class ProjectPropagationJobTest {
|
|||
*/
|
||||
@Test
|
||||
void UpdateTenTest() throws Exception {
|
||||
workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName());
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(ProjectPropagationJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
final String potentialUpdatePath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates")
|
||||
|
@ -118,10 +129,10 @@ public class ProjectPropagationJobTest {
|
|||
SparkResultToProjectThroughSemRelJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isTest", Boolean.TRUE.toString(),
|
||||
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-hive_metastore_uris", "",
|
||||
"-saveGraph", "true",
|
||||
|
||||
"-outputPath", workingDir.toString() + "/relation",
|
||||
"-potentialUpdatePath", potentialUpdatePath,
|
||||
"-alreadyLinkedPath", alreadyLinkedPath,
|
||||
|
@ -169,6 +180,9 @@ public class ProjectPropagationJobTest {
|
|||
.sql(
|
||||
"Select * from temporary where datainfo.inferenceprovenance = 'propagation'")
|
||||
.count());
|
||||
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -179,6 +193,12 @@ public class ProjectPropagationJobTest {
|
|||
*/
|
||||
@Test
|
||||
void UpdateMixTest() throws Exception {
|
||||
workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName());
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(ProjectPropagationJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
final String potentialUpdatepath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates")
|
||||
|
@ -190,10 +210,10 @@ public class ProjectPropagationJobTest {
|
|||
SparkResultToProjectThroughSemRelJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isTest", Boolean.TRUE.toString(),
|
||||
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-hive_metastore_uris", "",
|
||||
"-saveGraph", "true",
|
||||
|
||||
"-outputPath", workingDir.toString() + "/relation",
|
||||
"-potentialUpdatePath", potentialUpdatepath,
|
||||
"-alreadyLinkedPath", alreadyLinkedPath,
|
||||
|
@ -244,5 +264,7 @@ public class ProjectPropagationJobTest {
|
|||
.sql(
|
||||
"Select * from temporary where datainfo.inferenceprovenance = 'propagation'")
|
||||
.count());
|
||||
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue