diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index 74f31cf357..b32039e327 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -112,6 +112,16 @@
eu.dnetlib.dhp
dhp-schemas
+
+
+ org.apache.commons
+ commons-csv
+ 1.8
+
+
+
+
+
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java
index 8bdce903b1..1d839bec53 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java
@@ -16,10 +16,15 @@ import org.apache.commons.lang.reflect.FieldUtils;
public class CSVParser {
public List parse(String csvFile, String classForName)
+ throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException {
+ return parse(csvFile, classForName, ';');
+ }
+
+ public List parse(String csvFile, String classForName, char delimiter)
throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException {
final CSVFormat format = CSVFormat.EXCEL
.withHeader()
- .withDelimiter(';')
+ .withDelimiter(delimiter)
.withQuote('"')
.withTrim();
List ret = new ArrayList<>();
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java
index c73f7ec3d1..f9118350f5 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java
@@ -6,6 +6,7 @@ import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
+import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
@@ -29,6 +30,7 @@ public class ReadCSV implements Closeable {
private final BufferedWriter writer;
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private final String csvFile;
+ private final char delimiter;
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@@ -44,19 +46,23 @@ public class ReadCSV implements Closeable {
final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("hdfsNameNode");
final String classForName = parser.get("classForName");
-
- try (final ReadCSV readCSV = new ReadCSV(hdfsPath, hdfsNameNode, fileURL)) {
+		// Fall back to ';' when the option is absent or empty: charAt(0) on "" would throw.
+		final String delimiter = parser.get("delimiter");
+		final char del = Optional.ofNullable(delimiter).filter(d -> !d.isEmpty())
+			.map(d -> d.charAt(0)).orElse(';');
+ try (final ReadCSV readCSV = new ReadCSV(hdfsPath, hdfsNameNode, fileURL, del)) {
log.info("Getting CSV file...");
readCSV.execute(classForName);
}
+
}
public void execute(final String classForName) throws Exception {
CSVParser csvParser = new CSVParser();
csvParser
- .parse(csvFile, classForName)
+ .parse(csvFile, classForName, delimiter)
.stream()
.forEach(p -> write(p));
@@ -70,7 +76,8 @@ public class ReadCSV implements Closeable {
public ReadCSV(
final String hdfsPath,
final String hdfsNameNode,
- final String fileURL)
+ final String fileURL,
+ char delimiter)
throws Exception {
this.conf = new Configuration();
this.conf.set("fs.defaultFS", hdfsNameNode);
@@ -85,6 +92,7 @@ public class ReadCSV implements Closeable {
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
this.csvFile = httpConnector.getInputSource(fileURL);
+ this.delimiter = delimiter;
}
protected void write(final Object p) {
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java
index 5ce0a681cf..a13d9b791a 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java
@@ -31,7 +31,7 @@ public class ReadExcel implements Closeable {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
- ReadCSV.class
+ ReadExcel.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/project/parameters.json")));
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/parameters.json
index b6c9c94b94..9ccb70a9f9 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/parameters.json
@@ -28,6 +28,11 @@
"paramLongName" : "sheetName",
"paramDescription" : "the name of the sheet in case the file is excel",
"paramRequired" : false
+}, {
+ "paramName": "d",
+ "paramLongName" : "delimiter",
+ "paramDescription" : "the delimiter between fields in case it is not ;",
+ "paramRequired" : false
}
diff --git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml
index f496ea9a29..ea8832754d 100644
--- a/dhp-workflows/dhp-doiboost/pom.xml
+++ b/dhp-workflows/dhp-doiboost/pom.xml
@@ -82,7 +82,12 @@
org.apache.commons
commons-text
-
+
+ eu.dnetlib.dhp
+ dhp-aggregation
+ 1.2.4-SNAPSHOT
+ compile
+
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/GetCSV.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/GetCSV.java
new file mode 100644
index 0000000000..00b6b184ba
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/GetCSV.java
@@ -0,0 +1,58 @@
+
+package eu.dnetlib.doiboost;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
+/**
+ * Command-line entry point that reads the download arguments and delegates to {@link ReadCSV}, passing it the
+ * file URL, the HDFS target, the field delimiter and the name of the class each CSV row is mapped to.
+ */
+public class GetCSV {
+	private static final Log log = LogFactory.getLog(GetCSV.class);
+
+	/** Field delimiter used when the optional {@code delimiter} argument is missing or empty. */
+	private static final char DEFAULT_DELIMITER = ',';
+
+	/**
+	 * Parses the arguments declared in {@code download_unibi_issn_gold_parameters.json} and runs the download.
+	 *
+	 * @param args {@code fileURL}, {@code hdfsPath}, {@code hdfsNameNode} and {@code classForName} are read;
+	 *            {@code delimiter} is optional and only its first character is used
+	 * @throws Exception propagated unchanged from argument parsing or from {@link ReadCSV}
+	 */
+	public static void main(final String[] args) throws Exception {
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					GetCSV.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/download_unibi_issn_gold_parameters.json")));
+
+		parser.parseArgument(args);
+
+		final String fileURL = parser.get("fileURL");
+		final String hdfsPath = parser.get("hdfsPath");
+		final String hdfsNameNode = parser.get("hdfsNameNode");
+		final String classForName = parser.get("classForName");
+
+		// Honour the declared optional argument instead of silently ignoring it; the comma default keeps
+		// invocations that never pass a delimiter behaving exactly as before.
+		final String delimiterArg = parser.get("delimiter");
+		final char delimiter = (delimiterArg == null || delimiterArg.isEmpty())
+			? DEFAULT_DELIMITER
+			: delimiterArg.charAt(0);
+
+		try (final ReadCSV readCSV = new ReadCSV(hdfsPath, hdfsNameNode, fileURL, delimiter)) {
+
+			log.info("Getting CSV file...");
+			readCSV.execute(classForName);
+
+		}
+	}
+
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/UnibiGoldModel.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/UnibiGoldModel.java
new file mode 100644
index 0000000000..e5bd49adae
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/UnibiGoldModel.java
@@ -0,0 +1,157 @@
+
+package eu.dnetlib.doiboost;
+
+import java.io.Serializable;
+
+/**
+ * Serializable bean holding one row of the gold-ISSN CSV download, with every value kept as a plain string.
+ * <p>
+ * NOTE(review): the upper-case field names presumably mirror the CSV header so that columns can be bound by
+ * name; confirm against the parser before renaming any of them.
+ */
+public class UnibiGoldModel implements Serializable {
+	private String ISSN;
+	private String ISSN_L;
+	private String ISSN_IN_DOAJ;
+	private String ISSN_IN_ROAD;
+	private String ISSN_IN_PMC;
+	private String ISSN_IN_OAPC;
+	private String ISSN_IN_WOS;
+	private String ISSN_IN_SCOPUS;
+	private String JOURNAL_IN_DOAJ;
+	private String JOURNAL_IN_ROAD;
+	private String JOURNAL_IN_PMC;
+	private String JOURNAL_IN_OAPC;
+	private String JOURNAL_IN_WOS;
+	private String JOURNAL_IN_SCOPUS;
+	private String TITLE;
+	private String TITLE_SOURCE;
+
+	public String getISSN() {
+		return this.ISSN;
+	}
+
+	public void setISSN(final String issn) {
+		ISSN = issn;
+	}
+
+	public String getISSN_L() {
+		return this.ISSN_L;
+	}
+
+	public void setISSN_L(final String issnL) {
+		ISSN_L = issnL;
+	}
+
+	public String getISSN_IN_DOAJ() {
+		return this.ISSN_IN_DOAJ;
+	}
+
+	public void setISSN_IN_DOAJ(final String issnInDoaj) {
+		ISSN_IN_DOAJ = issnInDoaj;
+	}
+
+	public String getISSN_IN_ROAD() {
+		return this.ISSN_IN_ROAD;
+	}
+
+	public void setISSN_IN_ROAD(final String issnInRoad) {
+		ISSN_IN_ROAD = issnInRoad;
+	}
+
+	public String getISSN_IN_PMC() {
+		return this.ISSN_IN_PMC;
+	}
+
+	public void setISSN_IN_PMC(final String issnInPmc) {
+		ISSN_IN_PMC = issnInPmc;
+	}
+
+	public String getISSN_IN_OAPC() {
+		return this.ISSN_IN_OAPC;
+	}
+
+	public void setISSN_IN_OAPC(final String issnInOapc) {
+		ISSN_IN_OAPC = issnInOapc;
+	}
+
+	public String getISSN_IN_WOS() {
+		return this.ISSN_IN_WOS;
+	}
+
+	public void setISSN_IN_WOS(final String issnInWos) {
+		ISSN_IN_WOS = issnInWos;
+	}
+
+	public String getISSN_IN_SCOPUS() {
+		return this.ISSN_IN_SCOPUS;
+	}
+
+	public void setISSN_IN_SCOPUS(final String issnInScopus) {
+		ISSN_IN_SCOPUS = issnInScopus;
+	}
+
+	public String getJOURNAL_IN_DOAJ() {
+		return this.JOURNAL_IN_DOAJ;
+	}
+
+	public void setJOURNAL_IN_DOAJ(final String journalInDoaj) {
+		JOURNAL_IN_DOAJ = journalInDoaj;
+	}
+
+	public String getJOURNAL_IN_ROAD() {
+		return this.JOURNAL_IN_ROAD;
+	}
+
+	public void setJOURNAL_IN_ROAD(final String journalInRoad) {
+		JOURNAL_IN_ROAD = journalInRoad;
+	}
+
+	public String getJOURNAL_IN_PMC() {
+		return this.JOURNAL_IN_PMC;
+	}
+
+	public void setJOURNAL_IN_PMC(final String journalInPmc) {
+		JOURNAL_IN_PMC = journalInPmc;
+	}
+
+	public String getJOURNAL_IN_OAPC() {
+		return this.JOURNAL_IN_OAPC;
+	}
+
+	public void setJOURNAL_IN_OAPC(final String journalInOapc) {
+		JOURNAL_IN_OAPC = journalInOapc;
+	}
+
+	public String getJOURNAL_IN_WOS() {
+		return this.JOURNAL_IN_WOS;
+	}
+
+	public void setJOURNAL_IN_WOS(final String journalInWos) {
+		JOURNAL_IN_WOS = journalInWos;
+	}
+
+	public String getJOURNAL_IN_SCOPUS() {
+		return this.JOURNAL_IN_SCOPUS;
+	}
+
+	public void setJOURNAL_IN_SCOPUS(final String journalInScopus) {
+		JOURNAL_IN_SCOPUS = journalInScopus;
+	}
+
+	public String getTITLE() {
+		return this.TITLE;
+	}
+
+	public void setTITLE(final String title) {
+		TITLE = title;
+	}
+
+	public String getTITLE_SOURCE() {
+		return this.TITLE_SOURCE;
+	}
+
+	public void setTITLE_SOURCE(final String titleSource) {
+		TITLE_SOURCE = titleSource;
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_unibi_issn_gold_parameters.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_unibi_issn_gold_parameters.json
new file mode 100644
index 0000000000..9ccb70a9f9
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/download_unibi_issn_gold_parameters.json
@@ -0,0 +1,39 @@
+[
+
+ {
+ "paramName": "fu",
+ "paramLongName" : "fileURL",
+ "paramDescription" : "the url of the file to download",
+ "paramRequired" : true
+ },
+ {
+ "paramName": "hp",
+ "paramLongName" : "hdfsPath",
+ "paramDescription" : "where to save the file",
+ "paramRequired" : true
+ },
+ {
+ "paramName": "hnn",
+ "paramLongName" : "hdfsNameNode",
+ "paramDescription" : "the name node",
+ "paramRequired" : true
+ },
+ {
+ "paramName": "cfn",
+ "paramLongName" : "classForName",
+ "paramDescription" : "the name of the class to deserialize the csv to",
+ "paramRequired" : true
+}, {
+ "paramName": "sn",
+ "paramLongName" : "sheetName",
+ "paramDescription" : "the name of the sheet in case the file is excel",
+ "paramRequired" : false
+}, {
+ "paramName": "d",
+ "paramLongName" : "delimiter",
+ "paramDescription" : "the delimiter between fields; a comma is used for this download when it is omitted",
+ "paramRequired" : false
+}
+
+
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/preprocess/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/preprocess/oozie_app/workflow.xml
index 52f958d4d9..4de1a21854 100644
--- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/preprocess/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/preprocess/oozie_app/workflow.xml
@@ -63,6 +63,7 @@
+ ${wf:conf('resumeFrom') eq 'DownloadGoldIssn'}
${wf:conf('resumeFrom') eq 'UnpackCrossrefEntries'}
${wf:conf('resumeFrom') eq 'GenerateCrossrefDataset'}
${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}
@@ -76,6 +77,19 @@
Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+ eu.dnetlib.doiboost.GetCSV
+ --hdfsNameNode${nameNode}
+ --fileURL${unibiGoldIssnFileURL}
+ --hdfsPath${hdfsPath}
+ --classForNameeu.dnetlib.doiboost.UnibiGoldModel
+
+
+
+
+
+
${jobTracker}
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/GetCSVTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/GetCSVTest.java
new file mode 100644
index 0000000000..6cfc90736e
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/GetCSVTest.java
@@ -0,0 +1,43 @@
+
+package eu.dnetlib.dhp.doiboost;
+
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.actionmanager.project.utils.CSVParser;
+
+public class GetCSVTest {
+
+ @Test
+ public void readUnibiGoldTest() throws Exception {
+
+ String programmecsv = IOUtils
+ .toString(
+ getClass()
+ .getClassLoader()
+ .getResourceAsStream("eu/dnetlib/dhp/doiboost/issn_gold_oa_version_4.csv"));
+
+ CSVParser csvParser = new CSVParser();
+
+ List