forked from D-Net/dnet-hadoop
[openorgs dedup] fixed workflow parameter declarations. Introduced support for resuming the execution from intermediate steps
This commit is contained in:
parent
d0d477cca3
commit
815b9f4d56
|
@ -154,7 +154,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
(FlatMapFunction<ConnectedComponent, Relation>) cc -> ccToMergeRel(cc, dedupConf),
|
(FlatMapFunction<ConnectedComponent, Relation>) cc -> ccToMergeRel(cc, dedupConf),
|
||||||
Encoders.bean(Relation.class));
|
Encoders.bean(Relation.class));
|
||||||
|
|
||||||
mergeRels.write().mode(SaveMode.Append).parquet(mergeRelPath);
|
mergeRels.write().mode(SaveMode.Overwrite).parquet(mergeRelPath);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -109,7 +109,7 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
.rdd(),
|
.rdd(),
|
||||||
Encoders.bean(Relation.class));
|
Encoders.bean(Relation.class));
|
||||||
|
|
||||||
saveParquet(simRels, outputPath, SaveMode.Append);
|
saveParquet(simRels, outputPath, SaveMode.Overwrite);
|
||||||
|
|
||||||
log.info("Generated " + simRels.count() + " Similarity Relations");
|
log.info("Generated " + simRels.count() + " Similarity Relations");
|
||||||
|
|
||||||
|
|
|
@ -69,10 +69,6 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
|
||||||
.map(Integer::valueOf)
|
.map(Integer::valueOf)
|
||||||
.orElse(NUM_CONNECTIONS);
|
.orElse(NUM_CONNECTIONS);
|
||||||
|
|
||||||
final String apiUrl = Optional
|
|
||||||
.ofNullable(parser.get("apiUrl"))
|
|
||||||
.orElse("");
|
|
||||||
|
|
||||||
final String dbUrl = parser.get("dbUrl");
|
final String dbUrl = parser.get("dbUrl");
|
||||||
final String dbTable = parser.get("dbTable");
|
final String dbTable = parser.get("dbTable");
|
||||||
final String dbUser = parser.get("dbUser");
|
final String dbUser = parser.get("dbUser");
|
||||||
|
@ -83,7 +79,6 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
|
||||||
log.info("actionSetId: '{}'", actionSetId);
|
log.info("actionSetId: '{}'", actionSetId);
|
||||||
log.info("workingPath: '{}'", workingPath);
|
log.info("workingPath: '{}'", workingPath);
|
||||||
log.info("numPartitions: '{}'", numConnections);
|
log.info("numPartitions: '{}'", numConnections);
|
||||||
log.info("apiUrl: '{}'", apiUrl);
|
|
||||||
log.info("dbUrl: '{}'", dbUrl);
|
log.info("dbUrl: '{}'", dbUrl);
|
||||||
log.info("dbUser: '{}'", dbUser);
|
log.info("dbUser: '{}'", dbUser);
|
||||||
log.info("table: '{}'", dbTable);
|
log.info("table: '{}'", dbTable);
|
||||||
|
@ -106,10 +101,6 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Append)
|
.mode(SaveMode.Append)
|
||||||
.jdbc(dbUrl, dbTable, connectionProperties);
|
.jdbc(dbUrl, dbTable, connectionProperties);
|
||||||
|
|
||||||
if (!apiUrl.isEmpty())
|
|
||||||
updateSimRels(apiUrl);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Dataset<OrgSimRel> createNewOrgs(
|
public static Dataset<OrgSimRel> createNewOrgs(
|
||||||
|
@ -198,18 +189,6 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String updateSimRels(final String apiUrl) throws IOException {
|
|
||||||
|
|
||||||
log.info("Updating simrels on the portal");
|
|
||||||
|
|
||||||
final HttpGet req = new HttpGet(apiUrl);
|
|
||||||
try (final CloseableHttpClient client = HttpClients.createDefault()) {
|
|
||||||
try (final CloseableHttpResponse response = client.execute(req)) {
|
|
||||||
return IOUtils.toString(response.getEntity().getContent());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean filterRels(Relation rel, String entityType) {
|
private static boolean filterRels(Relation rel, String entityType) {
|
||||||
|
|
||||||
switch (entityType) {
|
switch (entityType) {
|
||||||
|
|
|
@ -12,6 +12,22 @@
|
||||||
<name>actionSetId</name>
|
<name>actionSetId</name>
|
||||||
<description>id of the actionSet</description>
|
<description>id of the actionSet</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>apiUrl</name>
|
||||||
|
<description>OpenOrgs API to finalise the suggestions import procedure</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>dbUrl</name>
|
||||||
|
<description>jdbc URL of the OpenOrgs database</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>dbUser</name>
|
||||||
|
<description>username to access the OpenOrgs database</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>dbPwd</name>
|
||||||
|
<description>password to access the OpenOrgs database</description>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>workingPath</name>
|
<name>workingPath</name>
|
||||||
<description>path for the working directory</description>
|
<description>path for the working directory</description>
|
||||||
|
@ -75,7 +91,17 @@
|
||||||
</configuration>
|
</configuration>
|
||||||
</global>
|
</global>
|
||||||
|
|
||||||
<start to="resetOrgSimRels"/>
|
<start to="resume_from"/>
|
||||||
|
|
||||||
|
<decision name="resume_from">
|
||||||
|
<switch>
|
||||||
|
<case to="CreateSimRels">${wf:conf('resumeFrom') eq 'CreateSimRels'}</case>
|
||||||
|
<case to="CreateMergeRels">${wf:conf('resumeFrom') eq 'CreateMergeRels'}</case>
|
||||||
|
<case to="PrepareOrgRels">${wf:conf('resumeFrom') eq 'PrepareOrgRels'}</case>
|
||||||
|
<case to="update_openorgs">${wf:conf('resumeFrom') eq 'update_openorgs'}</case>
|
||||||
|
<default to="resetOrgSimRels"/>
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
@ -83,8 +109,8 @@
|
||||||
|
|
||||||
<action name="resetOrgSimRels">
|
<action name="resetOrgSimRels">
|
||||||
<fs>
|
<fs>
|
||||||
<delete path="${workingPath}/${actionSetIdOpenorgs}/organization_simrel"/>
|
<delete path="${workingPath}/${actionSetId}/organization_simrel"/>
|
||||||
<delete path="${workingPath}/${actionSetIdOpenorgs}/organization_mergerel"/>
|
<delete path="${workingPath}/${actionSetId}/organization_mergerel"/>
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="CreateSimRels"/>
|
<ok to="CreateSimRels"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -109,7 +135,7 @@
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||||
<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>
|
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
<arg>--numPartitions</arg><arg>1000</arg>
|
<arg>--numPartitions</arg><arg>1000</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
@ -138,7 +164,7 @@
|
||||||
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>
|
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
||||||
<arg>--numPartitions</arg><arg>1000</arg>
|
<arg>--numPartitions</arg><arg>1000</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="CreateMergeRels"/>
|
<ok to="CreateMergeRels"/>
|
||||||
|
@ -165,7 +191,7 @@
|
||||||
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||||
<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>
|
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
||||||
<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
|
<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="PrepareOrgRels"/>
|
<ok to="PrepareOrgRels"/>
|
||||||
|
@ -192,7 +218,7 @@
|
||||||
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||||
<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>
|
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
||||||
<arg>--dbUrl</arg><arg>${dbUrl}</arg>
|
<arg>--dbUrl</arg><arg>${dbUrl}</arg>
|
||||||
<arg>--dbTable</arg><arg>${dbTable}</arg>
|
<arg>--dbTable</arg><arg>${dbTable}</arg>
|
||||||
<arg>--dbUser</arg><arg>${dbUser}</arg>
|
<arg>--dbUser</arg><arg>${dbUser}</arg>
|
||||||
|
@ -223,14 +249,24 @@
|
||||||
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||||
<arg>--actionSetId</arg><arg>${actionSetIdOpenorgs}</arg>
|
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
||||||
<arg>--apiUrl</arg><arg>${apiUrl}</arg>
|
|
||||||
<arg>--dbUrl</arg><arg>${dbUrl}</arg>
|
<arg>--dbUrl</arg><arg>${dbUrl}</arg>
|
||||||
<arg>--dbTable</arg><arg>${dbTable}</arg>
|
<arg>--dbTable</arg><arg>${dbTable}</arg>
|
||||||
<arg>--dbUser</arg><arg>${dbUser}</arg>
|
<arg>--dbUser</arg><arg>${dbUser}</arg>
|
||||||
<arg>--dbPwd</arg><arg>${dbPwd}</arg>
|
<arg>--dbPwd</arg><arg>${dbPwd}</arg>
|
||||||
<arg>--numConnections</arg><arg>20</arg>
|
<arg>--numConnections</arg><arg>20</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
<ok to="update_openorgs"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="update_openorgs">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>/usr/bin/curl</exec>
|
||||||
|
<argument>${apiUrl}</argument>
|
||||||
|
</shell>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
Loading…
Reference in New Issue