forked from D-Net/dnet-hadoop
workflow aligned with stable_ids
This commit is contained in:
parent
0821d8e97d
commit
1265dadc90
|
@ -62,7 +62,7 @@ public class SparkDownloadOrcidAuthors {
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
|
String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
|
||||||
logger.info("lastUpdate: ", lastUpdate);
|
logger.info("lastUpdate: {}", lastUpdate);
|
||||||
if (StringUtils.isBlank(lastUpdate)) {
|
if (StringUtils.isBlank(lastUpdate)) {
|
||||||
throw new RuntimeException("last update info not found");
|
throw new RuntimeException("last update info not found");
|
||||||
}
|
}
|
||||||
|
|
|
@ -174,7 +174,9 @@ public class PublicationToOaf implements Serializable {
|
||||||
publication
|
publication
|
||||||
.getExternalReference()
|
.getExternalReference()
|
||||||
.add(
|
.add(
|
||||||
convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types"));
|
convertExtRef(
|
||||||
|
extId, classid, classname, ModelConstants.DNET_PID_TYPES,
|
||||||
|
ModelConstants.DNET_PID_TYPES));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
|
@ -1,42 +0,0 @@
|
||||||
<workflow-app name="import_orcid_no_doi" xmlns="uri:oozie:workflow:0.5">
|
|
||||||
<parameters>
|
|
||||||
<property>
|
|
||||||
<name>inputPath</name>
|
|
||||||
<value>/data/orcid_activities_2020/no_doi_dataset</value>
|
|
||||||
<description>path where retrieve the already generated action set</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>outputPath</name>
|
|
||||||
<value>/data/orcid_activities_2020/test_import_orcid_no_doi</value>
|
|
||||||
<description>path where to store the action set</description>
|
|
||||||
</property>
|
|
||||||
</parameters>
|
|
||||||
|
|
||||||
<global>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<configuration>
|
|
||||||
<property>
|
|
||||||
<name>oozie.action.sharelib.for.spark</name>
|
|
||||||
<value>${oozieActionShareLibForSpark2}</value>
|
|
||||||
</property>
|
|
||||||
</configuration>
|
|
||||||
</global>
|
|
||||||
|
|
||||||
<start to="importOrcidNoDoi"/>
|
|
||||||
|
|
||||||
<kill name="Kill">
|
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
|
||||||
</kill>
|
|
||||||
|
|
||||||
<action name="importOrcidNoDoi">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${inputPath}/*</arg>
|
|
||||||
<arg>${outputPath}</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="End"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<end name="End"/>
|
|
||||||
</workflow-app>
|
|
|
@ -1,5 +1,15 @@
|
||||||
<workflow-app name="gen_orcid_no_doi_dataset" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="gen_orcid_no_doi_dataset" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>workingPath</name>
|
||||||
|
<value>/data/orcid_activities_2020</value>
|
||||||
|
<description>path where the collection workflow stores the ORCID data</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>outputPath</name>
|
||||||
|
<description>path where to store the action set</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>spark2GenNoDoiDatasetMaxExecutors</name>
|
<name>spark2GenNoDoiDatasetMaxExecutors</name>
|
||||||
<value>40</value>
|
<value>40</value>
|
||||||
|
@ -35,10 +45,6 @@
|
||||||
<name>spark2EventLogDir</name>
|
<name>spark2EventLogDir</name>
|
||||||
<description>spark 2.* event log dir location</description>
|
<description>spark 2.* event log dir location</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
|
||||||
<name>workingPath</name>
|
|
||||||
<description>the working dir base path</description>
|
|
||||||
</property>
|
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<global>
|
<global>
|
||||||
|
@ -83,11 +89,20 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
<arg>--workingPath</arg><arg>${workingPath}/</arg>
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
|
||||||
<arg>-i</arg><arg>last_orcid_dataset</arg>
|
<arg>--orcidDataFolder</arg><arg>last_orcid_dataset</arg>
|
||||||
<arg>-oew</arg><arg>no_doi_dataset</arg>
|
<arg>--outputEnrichedWorksPath</arg><arg>no_doi_dataset</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
<ok to="importOrcidNoDoi"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="importOrcidNoDoi">
|
||||||
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
|
<arg>${workingPath}/no_doi_dataset/*</arg>
|
||||||
|
<arg>${outputPath}</arg>
|
||||||
|
</distcp>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
Loading…
Reference in New Issue