From ca37d3427bc4bfe05932c9231e11ccdfb98752f2 Mon Sep 17 00:00:00 2001
From: Enrico Ottonello
Date: Fri, 3 Jul 2020 23:30:31 +0200
Subject: [PATCH] separate workflow to parse orcid summaries, activities and generate dataset with no doi publications; test

---
 .../orcid/OrcidAuthorsDOIsDataGen.java        |   8 +-
 .../doiboost/orcid/OrcidDSManager.java        |  14 +-
 .../doiboost/orcid/OrcidDownloader.java       |   8 +-
 .../orcidnodoi/ActivitiesDumpReader.java      |   6 +-
 .../orcidnodoi/GenOrcidAuthorWork.java        |   3 +-
 .../SparkGenEnrichedOrcidWorks.java           |  18 +-
 .../orcidnodoi/oaf/PublicationToOaf.java      |   9 +-
 .../doiboost/create_orcid_authors_data.json   |   2 +-
 .../create_orcid_authors_dois_data.json       |   2 +-
 .../dhp/doiboost/download_orcid_data.json     |   2 +-
 .../oozie_app/workflow.xml                    | 497 +----------------
 .../dhp/doiboost/orcid/oozie_app/workflow.xml |  44 +-
 .../oozie_app/config-default.xml              |  31 ++
 .../orcid_activities/oozie_app/workflow.xml   | 514 ++++++++++++++++++
 .../oozie_app/config-default.xml              |  22 +
 .../orcid_summaries/oozie_app/workflow.xml    |  68 +++
 .../doiboost/orcid/OrcidClientTest.java       |  29 +-
 .../orcidnodoi/PublicationToOafTest.java      |   5 +-
 .../orcidnodoi/xml/OrcidNoDoiTest.java        |   4 +-
 .../xml/activity_work_0000-0002-2536-4498.xml |  72 +++
 20 files changed, 815 insertions(+), 543 deletions(-)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-2536-4498.xml

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java
index 70528a8f60..2ec4fe59db 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidAuthorsDOIsDataGen.java
@@ -25,8 +25,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
     public void generateAuthorsDOIsData() throws Exception {
         Configuration conf = initConfigurationObject();
         FileSystem fs = initFileSystemObject(conf);
-        String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz);
-        Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath));
+        String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
+        Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputAuthorsDOIsPath));
         ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath);
     }
 
@@ -41,8 +41,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
         hdfsServerUri = parser.get("hdfsServerUri");
         Log.info("HDFS URI: " + hdfsServerUri);
-        hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
-        Log.info("Default Path: " + hdfsOrcidDefaultPath);
+        workingPath = parser.get("workingPath");
+        Log.info("Default Path: " + workingPath);
         activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
         Log.info("Activities File Name: " +
activitiesFileNameTarGz); outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java index 4f846bdf3c..aa61c0117c 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java @@ -15,7 +15,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; public class OrcidDSManager { protected String hdfsServerUri; - protected String hdfsOrcidDefaultPath; + protected String workingPath; private String summariesFileNameTarGz; private String outputAuthorsPath; @@ -28,10 +28,10 @@ public class OrcidDSManager { public void generateAuthors() throws Exception { Configuration conf = initConfigurationObject(); FileSystem fs = initFileSystemObject(conf); - String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz); + String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz); Path outputPath = new Path( hdfsServerUri - .concat(hdfsOrcidDefaultPath) + .concat(workingPath) .concat(outputAuthorsPath) .concat("authors.seq")); SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath); @@ -41,7 +41,7 @@ public class OrcidDSManager { // ====== Init HDFS File System Object Configuration conf = new Configuration(); // Set FileSystem URI - conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath)); + conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath)); // Because of Maven conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); @@ -52,7 +52,7 @@ public class OrcidDSManager { // Get the filesystem - HDFS FileSystem fs = null; try { - fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf); + fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); @@ -71,8 +71,8 @@ public class OrcidDSManager { hdfsServerUri = parser.get("hdfsServerUri"); Log.info("HDFS URI: " + hdfsServerUri); - hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath"); - Log.info("Default Path: " + hdfsOrcidDefaultPath); + workingPath = parser.get("workingPath"); + Log.info("Working Path: " + workingPath); summariesFileNameTarGz = parser.get("summariesFileNameTarGz"); Log.info("Summaries File Name: " + summariesFileNameTarGz); outputAuthorsPath = parser.get("outputAuthorsPath"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java index 2e1a199da9..762d8aecd8 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDownloader.java @@ -69,12 +69,12 @@ public class OrcidDownloader extends OrcidDSManager { long startDownload = 0; Configuration conf = initConfigurationObject(); FileSystem fs = initFileSystemObject(conf); - String lambdaFileUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(lambdaFileName); + String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName); Path hdfsreadpath = new Path(lambdaFileUri); FSDataInputStream 
lambdaFileStream = fs.open(hdfsreadpath); Path hdfsoutputPath = new Path( hdfsServerUri - .concat(hdfsOrcidDefaultPath) + .concat(workingPath) .concat(outputPath) .concat("orcid_records.seq")); @@ -176,8 +176,8 @@ public class OrcidDownloader extends OrcidDSManager { hdfsServerUri = parser.get("hdfsServerUri"); Log.info("HDFS URI: " + hdfsServerUri); - hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath"); - Log.info("Default Path: " + hdfsOrcidDefaultPath); + workingPath = parser.get("workingPath"); + Log.info("Default Path: " + workingPath); lambdaFileName = parser.get("lambdaFileName"); Log.info("Lambda File Name: " + lambdaFileName); outputPath = parser.get("outputPath"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java index bf63568d8b..807f52972a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java @@ -26,8 +26,8 @@ import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi; public class ActivitiesDumpReader { - private static final int MAX_XML_WORKS_PARSED = 100; - private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 10; + private static final int MAX_XML_WORKS_PARSED = -1; + private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000; public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath) throws Exception { @@ -127,7 +127,7 @@ public class ActivitiesDumpReader { Log .warn( "Parsing work from tar archive and xml work: " + filename + " " + e.getMessage()); - Log.warn(e); +// Log.warn(e); } if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) { diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java index 8dcee796c2..041424ba9a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java @@ -16,7 +16,7 @@ public class GenOrcidAuthorWork extends OrcidDSManager { private String activitiesFileNameTarGz; private String outputWorksPath; - private String workingPath; +// private String workingPath; public static void main(String[] args) throws IOException, Exception { GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork(); @@ -45,7 +45,6 @@ public class GenOrcidAuthorWork extends OrcidDSManager { Log.info("HDFS URI: " + hdfsServerUri); workingPath = parser.get("workingPath"); Log.info("Working Path: " + workingPath); - hdfsOrcidDefaultPath = workingPath; activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz"); Log.info("Activities File Name: " + activitiesFileNameTarGz); outputWorksPath = parser.get("outputWorksPath"); diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java index ae1e4dae61..b0b989463b 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java +++ 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java @@ -16,6 +16,7 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,9 +25,11 @@ import com.google.gson.JsonElement; import com.google.gson.JsonParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.doiboost.orcid.json.JsonHelper; import eu.dnetlib.doiboost.orcid.model.AuthorData; import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi; +import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; import scala.Tuple2; @@ -59,7 +62,7 @@ public class SparkGenEnrichedOrcidWorks { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaPairRDD summariesRDD = sc - .sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class); + .sequenceFile(workingPath + "summaries/output/authors.seq", Text.class, Text.class); Dataset summariesDataset = spark .createDataset( summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(), @@ -89,8 +92,19 @@ public class SparkGenEnrichedOrcidWorks { .filter(Objects::nonNull) .toJavaRDD(); logger.info("Works enriched data created: " + enrichedWorksRDD.count()); - enrichedWorksRDD.repartition(10).saveAsTextFile(workingPath + outputEnrichedWorksPath); + enrichedWorksRDD.saveAsTextFile(workingPath + outputEnrichedWorksPath); logger.info("Works enriched data saved"); + JavaRDD> oafPublicationRDD = enrichedWorksRDD.map(e -> { + JsonElement j = new JsonParser().parse(e._2()); + return new Tuple2<>(e._1(), (Publication) PublicationToOaf + .generatePublicationActionsFromDump(j.getAsJsonObject())); + }); + + Dataset> publicationDataset = spark + .createDataset( + oafPublicationRDD.repartition(1).rdd(), + Encoders.tuple(Encoders.STRING(), Encoders.bean(Publication.class))); + publicationDataset.write().mode(SaveMode.Overwrite).save(workingPath + "no_doi_dataset/output"); }); } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java index dc03767ecf..19bfe0f301 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/oaf/PublicationToOaf.java @@ -172,7 +172,7 @@ public class PublicationToOaf { instance.setUrl(urls); } - final String pubDate = getPublicationDate(rootElement, "publication_date"); + final String pubDate = getPublicationDate(rootElement, "publicationDates"); if (StringUtils.isNotBlank(pubDate)) { instance.setDateofacceptance(mapStringField(pubDate, null)); } @@ -325,7 +325,12 @@ public class PublicationToOaf { private static String getPublicationDate(final JsonObject rootElement, final String jsonKey) { - final JsonObject pubDateJson = rootElement.getAsJsonObject(jsonKey); + JsonObject pubDateJson = null; + try { + pubDateJson = rootElement.getAsJsonObject(jsonKey); + } catch (Exception e) { + return null; + } if (pubDateJson == null) { return null; } diff --git 
a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json index bf992b5086..6f213e4150 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json @@ -1,6 +1,6 @@ [ {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, - {"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true}, {"paramName":"f", "paramLongName":"summariesFileNameTarGz", "paramDescription": "the name of the summaries orcid file", "paramRequired": true}, {"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json index 131c301257..b2f0fdedac 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json @@ -1,6 +1,6 @@ [ {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, - {"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true}, {"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true}, {"paramName":"o", "paramLongName":"outputAuthorsDOIsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_orcid_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_orcid_data.json index 444e487f7b..8c69b168b5 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_orcid_data.json +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_orcid_data.json @@ -1,6 +1,6 @@ [ {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, - {"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true}, {"paramName":"f", "paramLongName":"lambdaFileName", "paramDescription": "the name of the lambda file", "paramRequired": true}, {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}, {"paramName":"t", "paramLongName":"token", "paramDescription": "token to grant access", 
"paramRequired": true} diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml index df5e0e76f7..a60af8b456 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml @@ -1,75 +1,9 @@ - workingPath_activities + workingPath the working dir base path - - shell_cmd_0 - wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 0 - - - shell_cmd_1 - wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 1 - - - shell_cmd_2 - wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 2 - - - shell_cmd_3 - wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 3 - - - shell_cmd_4 - wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 4 - - - shell_cmd_5 - wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 5 - - - shell_cmd_6 - wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 6 - - - shell_cmd_7 - wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 7 - - - shell_cmd_8 - wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 
; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 8 - - - shell_cmd_9 - wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file 9 - - - shell_cmd_X - wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz - - the shell command that downloads and puts to hdfs orcid activity file X - @@ -80,436 +14,11 @@ - - + - + - - - - - - - - - - - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_0} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_0.tar.gz - -owno_doi_works/works_0.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_1} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_1.tar.gz - -owno_doi_works/works_1.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_2} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_2.tar.gz - -owno_doi_works/works_2.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_3} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_3.tar.gz - -owno_doi_works/works_3.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_4} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_4.tar.gz - -owno_doi_works/works_4.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_5} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_5.tar.gz - -owno_doi_works/works_5.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))} - - - - - - - - ${jobTracker} 
- ${nameNode} - bash - -c - ${shell_cmd_6} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_6.tar.gz - -owno_doi_works/works_6.seq - -oewno_doi_enriched_works/ - - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_7} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_7.tar.gz - -owno_doi_works/works_7.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_8} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_8.tar.gz - -owno_doi_works/works_8.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_9} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_9.tar.gz - -owno_doi_works/works_9.seq - -oewno_doi_enriched_works/ - - - - - - - - - ${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))} - - - - - - - - ${jobTracker} - ${nameNode} - bash - -c - ${shell_cmd_X} - - - - - - - - - ${jobTracker} - ${nameNode} - eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork - -w${workingPath_activities}/ - -n${nameNode} - -fORCID_2019_activites_X.tar.gz - -owno_doi_works/works_X.seq - -oewno_doi_enriched_works/ - - - - - - diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml index 7a8d041870..51e00dc0f8 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid/oozie_app/workflow.xml @@ -1,9 +1,15 @@ - + workingPath the working dir base path + + shell_cmd_0 + wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f /tmp/ORCID_2019_summaries.tar.gz + + the shell command that downloads and puts to hdfs orcid summaries + @@ -15,24 +21,44 @@ - - + + - + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_0} + + + - - - + ${jobTracker} ${nameNode} eu.dnetlib.doiboost.orcid.OrcidDSManager - -d${workingPath}/ + -w${workingPath}/ -n${nameNode} -fORCID_2019_summaries.tar.gz - -ooutput/ + -osummaries/output/ diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml new file mode 100644 index 0000000000..3068562d06 --- /dev/null +++ 
b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/config-default.xml @@ -0,0 +1,31 @@ + + + oozie.action.sharelib.for.java + spark2 + + + oozie.launcher.mapreduce.user.classpath.first + true + + + oozie.launcher.mapreduce.map.java.opts + -Xmx4g + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml new file mode 100644 index 0000000000..8f9a5123e6 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_activities/oozie_app/workflow.xml @@ -0,0 +1,514 @@ + + + + workingPath + the working dir base path + + + shell_cmd_0 + wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 0 + + + shell_cmd_1 + wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 1 + + + shell_cmd_2 + wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 2 + + + shell_cmd_3 + wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 3 + + + shell_cmd_4 + wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 4 + + + shell_cmd_5 + wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 5 + + + shell_cmd_6 + wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 6 + + + shell_cmd_7 + wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal 
/tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 7 + + + shell_cmd_8 + wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 8 + + + shell_cmd_9 + wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file 9 + + + shell_cmd_X + wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz + + the shell command that downloads and puts to hdfs orcid activity file X + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_0.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_0} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_0.tar.gz + -owno_doi_works/works_0.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_1.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_1} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_1.tar.gz + -owno_doi_works/works_1.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_2.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_2} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_2.tar.gz + -owno_doi_works/works_2.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_3.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_3} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_3.tar.gz + -owno_doi_works/works_3.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_4.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_4} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_4.tar.gz + -owno_doi_works/works_4.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_5.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_5} + + + + + + + + + ${jobTracker} + ${nameNode} + 
eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_5.tar.gz + -owno_doi_works/works_5.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_6.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_6} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_6.tar.gz + -owno_doi_works/works_6.seq + -oewno_doi_enriched_works/ + + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_7.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_7} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_7.tar.gz + -owno_doi_works/works_7.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_8.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_8} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_8.tar.gz + -owno_doi_works/works_8.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_9.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_9} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_9.tar.gz + -owno_doi_works/works_9.seq + -oewno_doi_enriched_works/ + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_activites_X.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_X} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_activites_X.tar.gz + -owno_doi_works/works_X.seq + -oewno_doi_enriched_works/ + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml new file mode 100644 index 0000000000..e77dd09c9d --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/config-default.xml @@ -0,0 +1,22 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + oozie.launcher.mapreduce.user.classpath.first + true + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml new file mode 100644 index 0000000000..3362cc67b7 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid_summaries/oozie_app/workflow.xml @@ -0,0 +1,68 @@ + + + + workingPath + the working dir base path + + + shell_cmd_0 + wget -O /tmp/ORCID_2019_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/18017633 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_summaries.tar.gz /data/orcid_activities/ORCID_2019_summaries.tar.gz ; rm -f 
/tmp/ORCID_2019_summaries.tar.gz + + the shell command that downloads and puts to hdfs orcid summaries + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + ${fs:exists(concat(workingPath,'/ORCID_2019_summaries.tar.gz'))} + + + + + + + + ${jobTracker} + ${nameNode} + bash + -c + ${shell_cmd_0} + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.doiboost.orcid.OrcidDSManager + -w${workingPath}/ + -n${nameNode} + -fORCID_2019_summaries.tar.gz + -osummaries/output/ + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java index 8b50f2d8f9..5e0f91ecd9 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java @@ -3,9 +3,8 @@ package eu.dnetlib.doiboost.orcid; import static org.junit.jupiter.api.Assertions.assertTrue; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; +import java.io.*; +import java.nio.file.Files; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Arrays; @@ -20,6 +19,7 @@ import org.apache.http.impl.client.HttpClients; import org.junit.jupiter.api.Test; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import jdk.nashorn.internal.ir.annotations.Ignore; public class OrcidClientTest { final String orcidId = "0000-0001-7291-3210"; @@ -32,11 +32,20 @@ public class OrcidClientTest { String lastUpdate = "2019-09-30 00:00:00"; String shortDate = "2020-05-06 16:06:11"; -// curl -i -H "Accept: application/vnd.orcid+xml" +// curl -i -H "Accept: application/vnd.orcid+xml" // -H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d' // 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record' - public String testDownloadRecord(String orcidId) throws Exception { + @Test + public void downloadTest() throws Exception { + String record = testDownloadRecord("0000-0002-2536-4498"); + File f = new File("/tmp/downloaded_0000-0002-2536-4498.xml"); + OutputStream outStream = new FileOutputStream(f); + IOUtils.write(record.getBytes(), outStream); + System.out.println("saved to tmp"); + } + + private String testDownloadRecord(String orcidId) throws Exception { try (CloseableHttpClient client = HttpClients.createDefault()) { HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record"); httpGet.addHeader("Accept", "application/vnd.orcid+xml"); @@ -100,7 +109,7 @@ public class OrcidClientTest { } // @Test - public void getRecordDatestamp() throws ParseException { + private void getRecordDatestamp() throws ParseException { Date toRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toRetrieveDate); Date toNotRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toNotRetrieveDate); Date lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate); @@ -108,7 +117,7 @@ public class OrcidClientTest { assertTrue(!toNotRetrieveDateDt.after(lastUpdateDt)); } - public void testDate(String value) throws ParseException { + private void testDate(String value) throws ParseException { System.out.println(value.toString()); if (value.length() != 19) { value = value.substring(0, 19); @@ -118,14 +127,16 @@ public class OrcidClientTest { } // @Test - public void testModifiedDate() throws ParseException { + @Ignore + 
private void testModifiedDate() throws ParseException { testDate(toRetrieveDate); testDate(toNotRetrieveDate); testDate(shortDate); } // @Test - public void testReadBase64CompressedRecord() throws Exception { + @Ignore + private void testReadBase64CompressedRecord() throws Exception { final String base64CompressedRecord = IOUtils .toString(getClass().getResourceAsStream("0000-0001-6645-509X.compressed.base64")); final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord); diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java index 4d04e1a164..39f78522f7 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/PublicationToOafTest.java @@ -13,14 +13,15 @@ import com.google.gson.JsonParser; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; +import jdk.nashorn.internal.ir.annotations.Ignore; public class PublicationToOafTest { private static final Logger logger = LoggerFactory.getLogger(PublicationToOafTest.class); @Test -// @Ignore - public void convertOafPublicationTest() throws Exception { + @Ignore + private void convertOafPublicationTest() throws Exception { String jsonPublication = IOUtils .toString( PublicationToOafTest.class.getResourceAsStream("publication.json")); diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java index d426b01f17..ca91a242a1 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java @@ -42,12 +42,12 @@ public class OrcidNoDoiTest { @Test @Ignore - private void readPublicationFieldsTest() + public void readPublicationFieldsTest() throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException { logger.info("running loadPublicationFieldsTest ...."); String xml = IOUtils .toString( - OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191.xml")); + OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0002-2536-4498.xml")); if (xml == null) { logger.info("Resource not found"); diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-2536-4498.xml b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-2536-4498.xml new file mode 100644 index 0000000000..43d3b23515 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/orcidnodoi/xml/activity_work_0000-0002-2536-4498.xml @@ -0,0 +1,72 @@ + + + 2019-10-22T03:18:13.755Z + 2020-06-17T11:07:13.703Z + + + https://orcid.org/client/0000-0001-8607-8906 + 0000-0001-8607-8906 + orcid.org + + INSPIRE-HEP + + + Measurement of the $t\bar{t}$ production cross-section and lepton differential distributions in $e\mu$ dilepton events from $pp$ collisions at $\sqrt{s}=13$ TeV with the ATLAS detector + + + + other-id + 1759875 + 1759875 + http://inspirehep.net/record/1759875 + self + + + doi + 10.1140/epjc/s10052-020-7907-9 + 
10.1140/epjc/s10052-020-7907-9 + http://dx.doi.org/10.1140/epjc/s10052-020-7907-9 + self + + + arxiv + 1910.08819 + arXiv:1910.08819 + http://arxiv.org/abs/1910.08819 + self + + + http://inspirehep.net/record/1759875 + journal-article + + 2020 + 06 + 12 + + Eur.Phys.J.C +
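
The core addition in SparkGenEnrichedOrcidWorks is the conversion of the enriched (orcidId, work JSON) pairs into OAF Publication records, persisted as a typed Spark Dataset under no_doi_dataset/output. Since the hunk above is line-wrapped, here is a minimal sketch of that step; class and method names are the ones visible in the patch, while the surrounding SparkSession/RDD wiring is illustrative only:

```java
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import com.google.gson.JsonElement;
import com.google.gson.JsonParser;

import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;

import scala.Tuple2;

public class NoDoiDatasetSketch {

    // Sketch of the new step: (orcidId, enrichedWorkJson) -> (orcidId, Publication),
    // then written as a typed Dataset, mirroring the hunk added to SparkGenEnrichedOrcidWorks.
    static void saveNoDoiDataset(SparkSession spark,
        JavaRDD<Tuple2<String, String>> enrichedWorksRDD,
        String workingPath) {

        JavaRDD<Tuple2<String, Publication>> oafPublicationRDD = enrichedWorksRDD
            .map(e -> {
                // parse the enriched work JSON and convert it to an OAF Publication
                JsonElement j = new JsonParser().parse(e._2());
                return new Tuple2<>(
                    e._1(),
                    (Publication) PublicationToOaf.generatePublicationActionsFromDump(j.getAsJsonObject()));
            });

        // keep the ORCID iD alongside each converted publication in the saved dataset
        Dataset<Tuple2<String, Publication>> publicationDataset = spark
            .createDataset(
                oafPublicationRDD.repartition(1).rdd(),
                Encoders.tuple(Encoders.STRING(), Encoders.bean(Publication.class)));

        publicationDataset.write().mode(SaveMode.Overwrite).save(workingPath + "no_doi_dataset/output");
    }
}
```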
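The new downloadTest in OrcidClientTest fetches a single ORCID record over the v3.0 member API, matching the curl command quoted in the test. A self-contained sketch of that call follows; it assumes a valid bearer token, and the token value, class name, and output path are placeholders:

```java
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

public class OrcidRecordDownloadSketch {

    // Equivalent of:
    //   curl -H 'Accept: application/vnd.orcid+xml' \
    //        -H 'Authorization: Bearer <token>' \
    //        'https://api.orcid.org/v3.0/<orcidId>/record'
    static String downloadRecord(String orcidId, String accessToken) throws Exception {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
            httpGet.addHeader("Accept", "application/vnd.orcid+xml");
            httpGet.addHeader("Authorization", "Bearer " + accessToken);
            HttpResponse response = client.execute(httpGet);
            if (response.getStatusLine().getStatusCode() != 200) {
                throw new RuntimeException("HTTP error: " + response.getStatusLine().getStatusCode());
            }
            return IOUtils.toString(response.getEntity().getContent(), "UTF-8");
        }
    }

    public static void main(String[] args) throws Exception {
        // Placeholder token; a real ORCID member API token is required.
        String record = downloadRecord("0000-0002-2536-4498", "<access-token>");
        // Save the XML locally, as the test does with /tmp/downloaded_0000-0002-2536-4498.xml
        try (OutputStream out = new FileOutputStream(new File("/tmp/downloaded_0000-0002-2536-4498.xml"))) {
            IOUtils.write(record.getBytes(), out);
        }
    }
}
```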