From 4ae55e3891cf22de93a243b79d3393e1070051e2 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Mon, 20 Apr 2020 12:00:04 +0200 Subject: [PATCH 1/2] added workflow parameters --- .../doiboost/orcid/OrcidDSManager.java | 62 ++++++------------- .../doiboost/orcid/SummariesDecompressor.java | 4 +- .../doiboost/create_orcid_authors_data.json | 6 ++ .../dhp/doiboost/orcid.oozie_app/workflow.xml | 6 +- 4 files changed, 30 insertions(+), 48 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java index 05e280d41..daff69623 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java @@ -1,35 +1,30 @@ package eu.dnetlib.doiboost.orcid; -import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; -import java.util.Properties; - +import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + public class OrcidDSManager { private static final Logger logger = LoggerFactory.getLogger(OrcidDSManager.class); private String hdfsServerUri; - private String hadoopUsername; private String hdfsOrcidDefaultPath; private String summariesFileNameTarGz; private String outputAuthorsPath; - public static void main(String[] args) { + public static void main(String[] args) throws IOException, Exception { logger.info("OrcidDSManager started"); OrcidDSManager orcidDSManager = new OrcidDSManager(); - try { - orcidDSManager.initGARRProperties(); - 
orcidDSManager.generateAuthors(); - } catch (Exception e) { - logger.error("Generating authors data: "+e.getMessage()); - } + orcidDSManager.loadArgs(args); + orcidDSManager.generateAuthors(); } public void generateAuthors() throws Exception { @@ -49,9 +44,6 @@ public class OrcidDSManager { // Because of Maven conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - // Set HADOOP user - System.setProperty("HADOOP_USER_NAME", hadoopUsername); - System.setProperty("hadoop.home.dir", "/"); return conf; } @@ -66,34 +58,18 @@ public class OrcidDSManager { } return fs; } - - private void loadProperties() throws FileNotFoundException, IOException { - - Properties appProps = new Properties(); - ClassLoader classLoader = ClassLoader.getSystemClassLoader(); - appProps.load(classLoader.getResourceAsStream("orciddsmanager/props/app.properties")); - hdfsServerUri = appProps.getProperty("hdfs.server.uri"); - hadoopUsername = appProps.getProperty("hdfs.hadoopusername"); - hdfsOrcidDefaultPath = appProps.getProperty("hdfs.orcid.defaultpath"); - summariesFileNameTarGz = appProps.getProperty("hdfs.orcid.summariesfilename.tar.gz"); - outputAuthorsPath = appProps.getProperty("hdfs.orcid.output.authorspath"); - } - - private void initDefaultProperties() throws FileNotFoundException, IOException { - - hdfsServerUri = "hdfs://localhost:9000"; - hadoopUsername = "enrico.ottonello"; - hdfsOrcidDefaultPath = "/user/enrico.ottonello/orcid/"; - summariesFileNameTarGz = "ORCID_2019_summaries.tar.gz"; - outputAuthorsPath = "output/"; - } - - private void initGARRProperties() throws FileNotFoundException, IOException { - - hdfsServerUri = "hdfs://hadoop-rm1.garr-pa1.d4science.org:8020"; - hadoopUsername = "root"; - hdfsOrcidDefaultPath = "/data/orcid_summaries/"; - summariesFileNameTarGz = "ORCID_2019_summaries.tar.gz"; - outputAuthorsPath = "output/"; + + private void 
loadArgs(String[] args) throws IOException, Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(OrcidDSManager.class.getResourceAsStream("/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json"))); + parser.parseArgument(args); + + final String hdfsServerUri = parser.get("hdfsServerUri"); + logger.info("HDFS URI: "+hdfsServerUri); + Path hdfsOrcidDefaultPath = new Path(parser.get("hdfsOrcidDefaultPath")); + logger.info("Default Path: "+hdfsOrcidDefaultPath); + final String summariesFileNameTarGz = parser.get("summariesFileNameTarGz"); + logger.info("Summaries File Name: "+summariesFileNameTarGz); + final String outputAuthorsPath = parser.get("outputAuthorsPath"); + logger.info("Output Authors Data: "+outputAuthorsPath); } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java index d8eb49013..2a8d6d6de 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java @@ -122,14 +122,14 @@ public class SummariesDecompressor { } } - if ((counter % 1000) == 0) { + if ((counter % 100000) == 0) { logger.info("Current xml records parsed: "+counter); } } } } catch (IOException e) { logger.error("Parsing record from gzip archive: "+e.getMessage()); - e.printStackTrace(); + throw new RuntimeException(e); } logger.info("Summaries parse completed"); logger.info("Total XML records parsed: "+counter); diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json new file mode 100644 index 000000000..bf992b508 --- /dev/null +++ 
b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json @@ -0,0 +1,6 @@ +[ + {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, + {"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true}, + {"paramName":"f", "paramLongName":"summariesFileNameTarGz", "paramDescription": "the name of the summaries orcid file", "paramRequired": true}, + {"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequential file to write", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid.oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid.oozie_app/workflow.xml index 5fb25a696..36e0cdf1b 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid.oozie_app/workflow.xml +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid.oozie_app/workflow.xml @@ -29,10 +29,10 @@ ${jobTracker} ${nameNode} eu.dnetlib.doiboost.orcid.OrcidDSManager - - -t${workingPath}/input/crossref/index_dump + -d${workingPath}/ -n${nameNode} - + -fORCID_2019_summaries.tar.gz + -ooutput/ From a466648b4bdbc899d90c27442c8cc573d5bdf59e Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Mon, 20 Apr 2020 12:32:03 +0200 Subject: [PATCH 2/2] renamed output file --- .../src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java index daff69623..e350877a9 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java +++ 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java @@ -32,7 +32,7 @@ public class OrcidDSManager { FileSystem fs = initFileSystemObject(conf); String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz); logger.info("Started parsing "+tarGzUri); - Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsPath).concat(Long.toString(System.currentTimeMillis())).concat("/authors_part")); + Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsPath).concat(Long.toString(System.currentTimeMillis())).concat("/authors.seq")); SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath); }