From 5f0903d50d6abff7ae5ae98a71b53d0045d62ae8 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 13 Aug 2021 14:17:54 +0200 Subject: [PATCH] fixed CSV downloader & tests --- .../dhp/oa/graph/hostedbymap/DownloadCSV.java | 26 ++++++++---- .../oa/graph/hostedbymap/DownloadCsvTest.java | 42 ++++++++----------- 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCSV.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCSV.java index c8329c99c..be35d31f8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCSV.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCSV.java @@ -2,9 +2,12 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap; import java.io.*; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.Objects; import java.util.Optional; +import eu.dnetlib.dhp.common.collection.CollectorException; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; @@ -56,30 +59,37 @@ public class DownloadCSV { .orElse(DEFAULT_DELIMITER); log.info("delimiter {}", delimiter); - final HttpConnector2 connector2 = new HttpConnector2(); - Configuration conf = new Configuration(); conf.set("fs.defaultFS", hdfsNameNode); FileSystem fileSystem = FileSystem.get(conf); + + new DownloadCSV().doDownload(fileURL, workingPath, outputFile, classForName, delimiter, fileSystem); + + } + + protected void doDownload(String fileURL, String workingPath, String outputFile, String classForName, char delimiter, FileSystem fs) + throws IOException, ClassNotFoundException, CollectorException { + + final HttpConnector2 connector2 = new HttpConnector2(); + final Path path = new Path(workingPath + "/replaced.csv"); try (BufferedReader in = new BufferedReader( new InputStreamReader(connector2.getInputSourceAsStream(fileURL)))) { - try (FSDataOutputStream fos = fileSystem.create(path, true)) { + try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fs.create(path, true), Charset.defaultCharset()))) { String line; while ((line = in.readLine()) != null) { - fos.writeUTF(line.replace("\\\"", "\"")); - fos.writeUTF("\n"); + writer.write(line.replace("\\\"", "\"")); + writer.newLine(); } } } - try (InputStreamReader reader = new InputStreamReader(fileSystem.open(path))) { - GetCSV.getCsv(fileSystem, reader, outputFile, classForName, delimiter); + try (InputStreamReader reader = new InputStreamReader(fs.open(path))) { + GetCSV.getCsv(fs, reader, outputFile, classForName, delimiter); } - } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java index b93320e1a..7b02025fb 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/DownloadCsvTest.java @@ -39,14 +39,16 @@ public class DownloadCsvTest { String fileURL = "https://pub.uni-bielefeld.de/download/2944717/2944718/issn_gold_oa_version_4.csv"; - GetCSV - .getCsv( - fs, new BufferedReader( - new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL))), - workingDir + "/programme", - UnibiGoldModel.class.getName()); + final String outputFile = workingDir + "/unibi_gold.json"; + new DownloadCSV().doDownload( + fileURL, + workingDir + "/unibi_gold", + outputFile, + UnibiGoldModel.class.getName(), + ',', + fs); - BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(workingDir + "/programme")))); + BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(outputFile)))); String line; int count = 0; @@ -82,24 +84,16 @@ public class DownloadCsvTest { String fileURL = "https://doaj.org/csv"; - try (BufferedReader in = new BufferedReader( - new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL)))) { - try (PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter("/tmp/DOAJ_1.csv")))) { - String line; - while ((line = in.readLine()) != null) { - writer.println(line.replace("\\\"", "\"")); - } - } - } + final String outputFile = workingDir + "/doaj.json"; + new DownloadCSV().doDownload( + fileURL, + workingDir + "/doaj", + outputFile, + DOAJModel.class.getName(), + ',', + fs); - GetCSV - .getCsv( - fs, new BufferedReader( - new FileReader("/tmp/DOAJ_1.csv")), - workingDir + "/programme", - DOAJModel.class.getName()); - - BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(workingDir + "/programme")))); + BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(outputFile)))); String line; int count = 0;