forked from D-Net/dnet-hadoop
workflow aligned with stable_ids
This commit is contained in:
parent
0821d8e97d
commit
1265dadc90
|
@ -62,7 +62,7 @@ public class SparkDownloadOrcidAuthors {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
|
||||
logger.info("lastUpdate: ", lastUpdate);
|
||||
logger.info("lastUpdate: {}", lastUpdate);
|
||||
if (StringUtils.isBlank(lastUpdate)) {
|
||||
throw new RuntimeException("last update info not found");
|
||||
}
|
||||
|
|
|
@ -174,7 +174,9 @@ public class PublicationToOaf implements Serializable {
|
|||
publication
|
||||
.getExternalReference()
|
||||
.add(
|
||||
convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types"));
|
||||
convertExtRef(
|
||||
extId, classid, classname, ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES));
|
||||
}
|
||||
});
|
||||
|
||||
|
|
|
@ -1,42 +0,0 @@
|
|||
<workflow-app name="import_orcid_no_doi" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>inputPath</name>
|
||||
<value>/data/orcid_activities_2020/no_doi_dataset</value>
|
||||
<description>path where retrieve the already generated action set</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<value>/data/orcid_activities_2020/test_import_orcid_no_doi</value>
|
||||
<description>path where to store the action set</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="importOrcidNoDoi"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="importOrcidNoDoi">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<arg>${inputPath}/*</arg>
|
||||
<arg>${outputPath}</arg>
|
||||
</distcp>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -1,5 +1,15 @@
|
|||
<workflow-app name="gen_orcid_no_doi_dataset" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<value>/data/orcid_activities_2020</value>
|
||||
<description>path where the collection workflow stores the ORCID data</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>path where to store the action set</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>spark2GenNoDoiDatasetMaxExecutors</name>
|
||||
<value>40</value>
|
||||
|
@ -35,10 +45,6 @@
|
|||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
|
@ -83,11 +89,20 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
</spark-opts>
|
||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-i</arg><arg>last_orcid_dataset</arg>
|
||||
<arg>-oew</arg><arg>no_doi_dataset</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}/</arg>
|
||||
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
|
||||
<arg>--orcidDataFolder</arg><arg>last_orcid_dataset</arg>
|
||||
<arg>--outputEnrichedWorksPath</arg><arg>no_doi_dataset</arg>
|
||||
</spark>
|
||||
<ok to="importOrcidNoDoi"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="importOrcidNoDoi">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<arg>${workingPath}/no_doi_dataset/*</arg>
|
||||
<arg>${outputPath}</arg>
|
||||
</distcp>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
|
Loading…
Reference in New Issue