[openorgs dedup] fixed workflow parameter declarations. Introduced support for resuming the execution from intermediate steps

This commit is contained in:
Claudio Atzori 2021-04-20 17:24:45 +02:00
parent d0d477cca3
commit 815b9f4d56
4 changed files with 47 additions and 32 deletions

View File

@ -154,7 +154,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
(FlatMapFunction<ConnectedComponent, Relation>) cc -> ccToMergeRel(cc, dedupConf),
Encoders.bean(Relation.class));
mergeRels.write().mode(SaveMode.Append).parquet(mergeRelPath);
mergeRels.write().mode(SaveMode.Overwrite).parquet(mergeRelPath);
}
}

View File

@ -109,7 +109,7 @@ public class SparkCreateSimRels extends AbstractSparkAction {
.rdd(),
Encoders.bean(Relation.class));
saveParquet(simRels, outputPath, SaveMode.Append);
saveParquet(simRels, outputPath, SaveMode.Overwrite);
log.info("Generated " + simRels.count() + " Similarity Relations");

View File

@ -69,10 +69,6 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
.map(Integer::valueOf)
.orElse(NUM_CONNECTIONS);
final String apiUrl = Optional
.ofNullable(parser.get("apiUrl"))
.orElse("");
final String dbUrl = parser.get("dbUrl");
final String dbTable = parser.get("dbTable");
final String dbUser = parser.get("dbUser");
@ -83,7 +79,6 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
log.info("actionSetId: '{}'", actionSetId);
log.info("workingPath: '{}'", workingPath);
log.info("numPartitions: '{}'", numConnections);
log.info("apiUrl: '{}'", apiUrl);
log.info("dbUrl: '{}'", dbUrl);
log.info("dbUser: '{}'", dbUser);
log.info("table: '{}'", dbTable);
@ -106,10 +101,6 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
.write()
.mode(SaveMode.Append)
.jdbc(dbUrl, dbTable, connectionProperties);
if (!apiUrl.isEmpty())
updateSimRels(apiUrl);
}
public static Dataset<OrgSimRel> createNewOrgs(
@ -198,18 +189,6 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
}
/**
 * Issues an HTTP GET against the given URL to trigger the simrels update on the portal.
 *
 * @param apiUrl URL of the endpoint to invoke
 * @return the response body decoded as UTF-8, or an empty string when the response has no entity
 * @throws IOException if the request cannot be executed or the response body cannot be read
 */
private static String updateSimRels(final String apiUrl) throws IOException {
    log.info("Updating simrels on the portal");

    final HttpGet req = new HttpGet(apiUrl);
    // Both resources are closed in reverse order (response first, then client).
    try (final CloseableHttpClient client = HttpClients.createDefault();
        final CloseableHttpResponse response = client.execute(req)) {
        // NOTE(review): the HTTP status is not inspected, so an error page is returned like a
        // successful body — confirm whether callers should fail on non-2xx responses.
        if (response.getEntity() == null) {
            // Responses without a body would otherwise raise a NullPointerException.
            return "";
        }
        // Explicit charset: the single-argument overload decodes with the platform default,
        // which varies between cluster nodes. Assumes IOUtils is Apache Commons IO — TODO confirm.
        return IOUtils.toString(response.getEntity().getContent(), java.nio.charset.StandardCharsets.UTF_8);
    }
}
private static boolean filterRels(Relation rel, String entityType) {
switch (entityType) {

View File

@ -12,6 +12,22 @@
<name>actionSetId</name>
<description>id of the actionSet</description>
</property>
<property>
<name>apiUrl</name>
<description>OpenOrgs API to finalise the suggestions import procedure</description>
</property>
<property>
<name>dbUrl</name>
<description>jdbc URL of the OpenOrgs database</description>
</property>
<property>
<name>dbUser</name>
<description>username to access the OpenOrgs database</description>
</property>
<property>
<name>dbPwd</name>
<description>password to access the OpenOrgs database</description>
</property>
<property>
<name>workingPath</name>
<description>path for the working directory</description>
@ -75,7 +91,17 @@
</configuration>
</global>
<start to="resetOrgSimRels"/>
<start to="resume_from"/>
<!--
    Entry-point router. When the 'resumeFrom' workflow configuration property equals one of the
    node names listed below, execution jumps straight to that node; any other value falls through
    to resetOrgSimRels.
    NOTE(review): presumably an undefined 'resumeFrom' matches no case and takes the default -
    confirm the wf:conf behaviour for missing properties.
-->
<decision name="resume_from">
<switch>
<case to="CreateSimRels">${wf:conf('resumeFrom') eq 'CreateSimRels'}</case>
<case to="CreateMergeRels">${wf:conf('resumeFrom') eq 'CreateMergeRels'}</case>
<case to="PrepareOrgRels">${wf:conf('resumeFrom') eq 'PrepareOrgRels'}</case>
<case to="update_openorgs">${wf:conf('resumeFrom') eq 'update_openorgs'}</case>
<default to="resetOrgSimRels"/>
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
@ -83,8 +109,8 @@
<action name="resetOrgSimRels">
<fs>
<delete path="${workingPath}/${actionSetIdOpenorgs}/organization_simrel"/>
<delete path="${workingPath}/${actionSetIdOpenorgs}/organization_mergerel"/>
<delete path="${workingPath}/${actionSetId}/organization_simrel"/>
<delete path="${workingPath}/${actionSetId}/organization_mergerel"/>
</fs>
<ok to="CreateSimRels"/>
<error to="Kill"/>
@ -109,7 +135,7 @@
</spark-opts>
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>1000</arg>
</spark>
@ -138,7 +164,7 @@
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
<arg>--numPartitions</arg><arg>1000</arg>
</spark>
<ok to="CreateMergeRels"/>
@ -165,7 +191,7 @@
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
</spark>
<ok to="PrepareOrgRels"/>
@ -192,7 +218,7 @@
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
<arg>--dbUrl</arg><arg>${dbUrl}</arg>
<arg>--dbTable</arg><arg>${dbTable}</arg>
<arg>--dbUser</arg><arg>${dbUser}</arg>
@ -223,14 +249,24 @@
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>
<arg>--apiUrl</arg><arg>${apiUrl}</arg>
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
<arg>--dbUrl</arg><arg>${dbUrl}</arg>
<arg>--dbTable</arg><arg>${dbTable}</arg>
<arg>--dbUser</arg><arg>${dbUser}</arg>
<arg>--dbPwd</arg><arg>${dbPwd}</arg>
<arg>--numConnections</arg><arg>20</arg>
</spark>
<ok to="update_openorgs"/>
<error to="Kill"/>
</action>
<!--
    Calls the endpoint given by the apiUrl parameter with curl.
    The fail option makes curl exit non-zero on HTTP 4xx/5xx responses, so a rejected call is
    routed to Kill instead of being reported as a successful End. The silent and show-error
    options drop the progress meter from the action log while still printing error messages.
    NOTE(review): assumes curl is installed at /usr/bin/curl on the node running the action.
-->
<action name="update_openorgs">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>/usr/bin/curl</exec>
<argument>--fail</argument>
<argument>--silent</argument>
<argument>--show-error</argument>
<argument>${apiUrl}</argument>
</shell>
<ok to="End"/>
<error to="Kill"/>
</action>