From 53d7023460ef5d6c4bf9737cc0740c1b2965fc27 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 25 Feb 2021 18:43:29 +0100 Subject: [PATCH] dateOfCollection taken from orcid last_update.txt on hdfs; cleaned wf parameters --- .../orcidnodoi/SparkGenEnrichedOrcidWorks.java | 18 ++++++++++++++---- .../orcidnodoi/oaf/PublicationToOaf.java | 7 +++++-- ...eters.json => gen_orcid-no-doi_params.json} | 3 +-- .../doiboost/orcidnodoi/oozie_app/workflow.xml | 3 +-- 4 files changed, 21 insertions(+), 10 deletions(-) rename dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/{gen_enriched_orcid_works_parameters.json => gen_orcid-no-doi_params.json} (57%) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index d3e408078..933162f28 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -8,7 +8,9 @@ import java.util.List; import java.util.Objects; import java.util.Optional; +import eu.dnetlib.doiboost.orcid.util.HDFSUtil; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.spark.SparkConf; @@ -57,26 +59,33 @@ public class SparkGenEnrichedOrcidWorks { .toString( SparkGenEnrichedOrcidWorks.class .getResourceAsStream( - "/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json"))); + "/eu/dnetlib/dhp/doiboost/gen_orcid-no-doi_params.json"))); parser.parseArgument(args); Boolean isSparkSessionManaged = Optional .ofNullable(parser.get("isSparkSessionManaged")) .map(Boolean::valueOf) .orElse(Boolean.TRUE); + final String hdfsServerUri = parser.get("hdfsServerUri"); final String workingPath = parser.get("workingPath"); final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath"); + final String orcidDataFolder = parser.get("orcidDataFolder"); SparkConf conf = new SparkConf(); runWithSparkSession( conf, isSparkSessionManaged, spark -> { + String lastUpdate = HDFSUtil.readFromTextFile(hdfsServerUri, workingPath, "last_update.txt"); + if (StringUtils.isBlank(lastUpdate)) { + throw new RuntimeException("last update info not found"); + } + final String dateOfCollection = lastUpdate.substring(0, 10); JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); Dataset authorDataset = spark .createDataset( sc - .textFile(workingPath.concat("last_orcid_dataset/authors/*")) + .textFile(workingPath.concat(orcidDataFolder).concat("/authors/*")) .map(item -> OBJECT_MAPPER.readValue(item, AuthorSummary.class)) .filter(authorSummary -> authorSummary.getAuthorData() != null) .map(authorSummary -> authorSummary.getAuthorData()) @@ -87,7 +96,7 @@ public class SparkGenEnrichedOrcidWorks { Dataset workDataset = spark .createDataset( sc - .textFile(workingPath.concat("last_orcid_dataset/works/*")) + .textFile(workingPath.concat(orcidDataFolder).concat("/works/*")) .map(item -> OBJECT_MAPPER.readValue(item, Work.class)) .filter(work -> work.getWorkDetail() != null) .map(work -> work.getWorkDetail()) @@ -134,7 +143,8 @@ public class SparkGenEnrichedOrcidWorks { errorsGeneric, errorsInvalidTitle, errorsNotFoundAuthors, - errorsInvalidType); + errorsInvalidType, + dateOfCollection); JavaRDD oafPublicationRDD = enrichedWorksRDD .map( e -> { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index 1444bb822..f78601506 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -36,6 +36,7 @@ public class PublicationToOaf implements Serializable { public static final String OPENAIRE_PREFIX = "openaire____"; public static final String SEPARATOR = "::"; + private String dateOfCollection = ""; private final LongAccumulator parsedPublications; private final LongAccumulator enrichedPublications; private final LongAccumulator errorsGeneric; @@ -49,13 +50,15 @@ public class PublicationToOaf implements Serializable { LongAccumulator errorsGeneric, LongAccumulator errorsInvalidTitle, LongAccumulator errorsNotFoundAuthors, - LongAccumulator errorsInvalidType) { + LongAccumulator errorsInvalidType, + String dateOfCollection) { this.parsedPublications = parsedPublications; this.enrichedPublications = enrichedPublications; this.errorsGeneric = errorsGeneric; this.errorsInvalidTitle = errorsInvalidTitle; this.errorsNotFoundAuthors = errorsNotFoundAuthors; this.errorsInvalidType = errorsInvalidType; + this.dateOfCollection = dateOfCollection; } public PublicationToOaf() { @@ -137,7 +140,7 @@ public class PublicationToOaf implements Serializable { publication.setLastupdatetimestamp(new Date().getTime()); - publication.setDateofcollection("2020-10-14"); + publication.setDateofcollection(dateOfCollection); publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601()); // Adding external ids diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid-no-doi_params.json similarity index 57% rename from dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json rename to dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid-no-doi_params.json index c3a8f92ec..3456329b1 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_orcid-no-doi_params.json @@ -1,7 +1,6 @@ [ {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true}, - {"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true}, - {"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}, + {"paramName":"i", "paramLongName":"orcidDataFolder", "paramDescription": "the folder of orcid data", "paramRequired": true}, {"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml index 610b7cc50..6513ff7e1 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcidnodoi/oozie_app/workflow.xml @@ -85,8 +85,7 @@ -w${workingPath}/ -n${nameNode} - -f- - -owno_doi_works/ + -ilast_orcid_dataset -oewno_doi_dataset