orcid-no-doi #123

Merged
claudio.atzori merged 13 commits from enrico.ottonello/dnet-hadoop:orcid-no-doi into master 2021-07-15 17:53:59 +02:00
4 changed files with 27 additions and 52 deletions
Showing only changes of commit 1265dadc90 - Show all commits

View File

@ -62,7 +62,7 @@ public class SparkDownloadOrcidAuthors {
isSparkSessionManaged,
spark -> {
String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt");
logger.info("lastUpdate: ", lastUpdate);
logger.info("lastUpdate: {}", lastUpdate);
if (StringUtils.isBlank(lastUpdate)) {
throw new RuntimeException("last update info not found");
}

View File

@ -174,7 +174,9 @@ public class PublicationToOaf implements Serializable {
publication
.getExternalReference()
.add(
convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types"));
convertExtRef(
extId, classid, classname, ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES));
}
});

View File

@ -1,42 +0,0 @@
<workflow-app name="import_orcid_no_doi" xmlns="uri:oozie:workflow:0.5">
    <!--
        Single-step workflow: copies the content of the input path into the
        output path through one distcp action.
    -->

    <!-- Workflow parameters; both declare a default value. -->
    <parameters>
        <property>
            <name>inputPath</name>
            <value>/data/orcid_activities_2020/no_doi_dataset</value>
            <description>path where retrieve the already generated action set</description>
        </property>
        <property>
            <name>outputPath</name>
            <value>/data/orcid_activities_2020/test_import_orcid_no_doi</value>
            <description>path where to store the action set</description>
        </property>
    </parameters>

    <!--
        Settings shared by every action. The jobTracker, nameNode and
        oozieActionShareLibForSpark2 variables are not declared in the
        parameters section of this file; presumably they are supplied by the
        submitting job configuration (to be confirmed against the job properties).
    -->
    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>

    <!-- Entry point: the copy action is the first and only step. -->
    <start to="importOrcidNoDoi"/>

    <!-- Failure terminal node: reports the error message of the last failed node. -->
    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <!--
        Copy step: the first argument is the source (every entry under the
        input path), the second one is the destination (the output path).
        Success leads to End, any error leads to Kill.
    -->
    <action name="importOrcidNoDoi">
        <distcp xmlns="uri:oozie:distcp-action:0.2">
            <arg>${inputPath}/*</arg>
            <arg>${outputPath}</arg>
        </distcp>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <!-- Success terminal node. -->
    <end name="End"/>
</workflow-app>

View File

@ -1,5 +1,15 @@
<workflow-app name="gen_orcid_no_doi_dataset" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<value>/data/orcid_activities_2020</value>
<description>path where the collection workflow stores the ORCID data</description>
</property>
<property>
<name>outputPath</name>
<description>path where to store the action set</description>
</property>
<property>
<name>spark2GenNoDoiDatasetMaxExecutors</name>
<value>40</value>
@ -35,10 +45,6 @@
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
</parameters>
<global>
@ -83,11 +89,20 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
</spark-opts>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-i</arg><arg>last_orcid_dataset</arg>
<arg>-oew</arg><arg>no_doi_dataset</arg>
<arg>--workingPath</arg><arg>${workingPath}/</arg>
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
<arg>--orcidDataFolder</arg><arg>last_orcid_dataset</arg>
<arg>--outputEnrichedWorksPath</arg><arg>no_doi_dataset</arg>
</spark>
<ok to="importOrcidNoDoi"/>
<error to="Kill"/>
</action>
<action name="importOrcidNoDoi">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${workingPath}/no_doi_dataset/*</arg>
<arg>${outputPath}</arg>
</distcp>
<ok to="End"/>
<error to="Kill"/>
</action>