diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
index 05e280d41..e350877a9 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/OrcidDSManager.java
@@ -1,35 +1,30 @@
 package eu.dnetlib.doiboost.orcid;
 
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.net.URI;
-import java.util.Properties;
-
+import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
 public class OrcidDSManager {
 
     private static final Logger logger = LoggerFactory.getLogger(OrcidDSManager.class);
 
     private String hdfsServerUri;
-    private String hadoopUsername;
     private String hdfsOrcidDefaultPath;
     private String summariesFileNameTarGz;
     private String outputAuthorsPath;
 
-    public static void main(String[] args) {
+    public static void main(String[] args) throws IOException, Exception {
         logger.info("OrcidDSManager started");
         OrcidDSManager orcidDSManager = new OrcidDSManager();
-        try {
-            orcidDSManager.initGARRProperties();
-            orcidDSManager.generateAuthors();
-        } catch (Exception e) {
-            logger.error("Generating authors data: "+e.getMessage());
-        }
+        orcidDSManager.loadArgs(args);
+        orcidDSManager.generateAuthors();
     }
 
     public void generateAuthors() throws Exception {
@@ -37,7 +32,7 @@ public class OrcidDSManager {
         FileSystem fs = initFileSystemObject(conf);
         String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
         logger.info("Started parsing "+tarGzUri);
-        Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsPath).concat(Long.toString(System.currentTimeMillis())).concat("/authors_part"));
+        Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsPath).concat(Long.toString(System.currentTimeMillis())).concat("/authors.seq"));
         SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
     }
 
@@ -49,9 +44,6 @@ public class OrcidDSManager {
         // Because of Maven
         conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
         conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
-        // Set HADOOP user
-        System.setProperty("HADOOP_USER_NAME", hadoopUsername);
-        System.setProperty("hadoop.home.dir", "/");
         return conf;
     }
 
@@ -66,34 +58,22 @@ public class OrcidDSManager {
         }
         return fs;
     }
-
-    private void loadProperties() throws FileNotFoundException, IOException {
-
-        Properties appProps = new Properties();
-        ClassLoader classLoader = ClassLoader.getSystemClassLoader();
-        appProps.load(classLoader.getResourceAsStream("orciddsmanager/props/app.properties"));
-        hdfsServerUri = appProps.getProperty("hdfs.server.uri");
-        hadoopUsername = appProps.getProperty("hdfs.hadoopusername");
-        hdfsOrcidDefaultPath = appProps.getProperty("hdfs.orcid.defaultpath");
-        summariesFileNameTarGz = appProps.getProperty("hdfs.orcid.summariesfilename.tar.gz");
-        outputAuthorsPath = appProps.getProperty("hdfs.orcid.output.authorspath");
-    }
-
-    private void initDefaultProperties() throws FileNotFoundException, IOException {
-
-        hdfsServerUri = "hdfs://localhost:9000";
-        hadoopUsername = "enrico.ottonello";
-        hdfsOrcidDefaultPath = "/user/enrico.ottonello/orcid/";
-        summariesFileNameTarGz = "ORCID_2019_summaries.tar.gz";
-        outputAuthorsPath = "output/";
-    }
-
-    private void initGARRProperties() throws FileNotFoundException, IOException {
-
-        hdfsServerUri = "hdfs://hadoop-rm1.garr-pa1.d4science.org:8020";
-        hadoopUsername = "root";
-        hdfsOrcidDefaultPath = "/data/orcid_summaries/";
-        summariesFileNameTarGz = "ORCID_2019_summaries.tar.gz";
-        outputAuthorsPath = "output/";
+
+    /**
+     * Parses the command line arguments and stores them in the instance fields
+     * read by generateAuthors().
+     */
+    private void loadArgs(String[] args) throws IOException, Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(OrcidDSManager.class.getResourceAsStream("/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json")));
+        parser.parseArgument(args);
+
+        hdfsServerUri = parser.get("hdfsServerUri");
+        logger.info("HDFS URI: "+hdfsServerUri);
+        hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
+        logger.info("Default Path: "+hdfsOrcidDefaultPath);
+        summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
+        logger.info("Summaries File Name: "+summariesFileNameTarGz);
+        outputAuthorsPath = parser.get("outputAuthorsPath");
+        logger.info("Output Authors Data: "+outputAuthorsPath);
     }
 }
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
index d8eb49013..2a8d6d6de 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
@@ -122,14 +122,14 @@ public class SummariesDecompressor {
                         }
                     }
 
-                    if ((counter % 1000) == 0) {
+                    if ((counter % 100000) == 0) {
                         logger.info("Current xml records parsed: "+counter);
                     }
                 }
             }
         } catch (IOException e) {
             logger.error("Parsing record from gzip archive: "+e.getMessage());
-            e.printStackTrace();
+            throw new RuntimeException(e);
         }
         logger.info("Summaries parse completed");
         logger.info("Total XML records parsed: "+counter);
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json
new file mode 100644
index 000000000..bf992b508
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json
@@ -0,0 +1,6 @@
+[
+  {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
+  {"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
+  {"paramName":"f", "paramLongName":"summariesFileNameTarGz", "paramDescription": "the name of the summaries orcid file", "paramRequired": true},
+  {"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid.oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid.oozie_app/workflow.xml
index 5fb25a696..36e0cdf1b 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid.oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/orcid.oozie_app/workflow.xml
@@ -29,10 +29,10 @@
             ${jobTracker}
             ${nameNode}
             eu.dnetlib.doiboost.orcid.OrcidDSManager
-
-            -t${workingPath}/input/crossref/index_dump
+            -d${workingPath}/
             -n${nameNode}
-
+            -fORCID_2019_summaries.tar.gz
+            -ooutput/