added DownloadCSV2 as alternative implementation of the same download procedure

This commit is contained in:
Claudio Atzori 2021-08-13 15:52:15 +02:00
parent 5f0903d50d
commit f74adc4752
5 changed files with 214 additions and 113 deletions

View File

@ -7,16 +7,15 @@ import java.nio.charset.StandardCharsets;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import eu.dnetlib.dhp.common.collection.CollectorException;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.GetCSV; import eu.dnetlib.dhp.common.collection.GetCSV;
import eu.dnetlib.dhp.common.collection.HttpConnector2; import eu.dnetlib.dhp.common.collection.HttpConnector2;
@ -68,8 +67,9 @@ public class DownloadCSV {
} }
protected void doDownload(String fileURL, String workingPath, String outputFile, String classForName, char delimiter, FileSystem fs) protected void doDownload(String fileURL, String workingPath, String outputFile, String classForName,
throws IOException, ClassNotFoundException, CollectorException { char delimiter, FileSystem fs)
throws IOException, ClassNotFoundException, CollectorException {
final HttpConnector2 connector2 = new HttpConnector2(); final HttpConnector2 connector2 = new HttpConnector2();
@ -78,11 +78,11 @@ public class DownloadCSV {
try (BufferedReader in = new BufferedReader( try (BufferedReader in = new BufferedReader(
new InputStreamReader(connector2.getInputSourceAsStream(fileURL)))) { new InputStreamReader(connector2.getInputSourceAsStream(fileURL)))) {
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fs.create(path, true), Charset.defaultCharset()))) { try (PrintWriter writer = new PrintWriter(
new OutputStreamWriter(fs.create(path, true), StandardCharsets.UTF_8))) {
String line; String line;
while ((line = in.readLine()) != null) { while ((line = in.readLine()) != null) {
writer.write(line.replace("\\\"", "\"")); writer.println(line.replace("\\\"", "\""));
writer.newLine();
} }
} }
} }

View File

@ -0,0 +1,84 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.Optional;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.GetCSV;
import eu.dnetlib.dhp.common.collection.HttpConnector2;
/**
 * Command-line entry point that downloads a remote CSV, stores a cleaned copy in a local temporary
 * file and then hands that copy to {@code GetCSV.getCsv} together with the target file system.
 *
 * <p>Cleaning consists of replacing every backslash-escaped double quote ({@code \"}) with a plain
 * double quote, line by line.
 *
 * <p>All text is decoded and encoded as UTF-8 so the result does not depend on the JVM default
 * charset of the node the job happens to run on.
 */
public class DownloadCSV2 {

	private static final Logger log = LoggerFactory.getLogger(DownloadCSV2.class);

	/** Delimiter used when the {@code delimiter} argument is absent or empty. */
	public static final char DEFAULT_DELIMITER = ';';

	/** Classpath location of the JSON document describing the accepted command-line arguments. */
	private static final String PARAMETERS_RESOURCE = "/eu/dnetlib/dhp/oa/graph/hostedbymap/download_csv_parameters.json";

	/**
	 * Runs the download.
	 *
	 * <p>Arguments read from the parser: {@code fileURL}, {@code tmpFile}, {@code outputFile},
	 * {@code hdfsNameNode}, {@code classForName} and the optional {@code delimiter} (only its first
	 * character is used).
	 *
	 * @param args the command-line arguments, parsed by {@code ArgumentApplicationParser}
	 * @throws Exception if the arguments cannot be parsed, the download fails, or the temporary file
	 *                   cannot be written or processed
	 */
	public static void main(final String[] args) throws Exception {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(loadParameterSpec());
		parser.parseArgument(args);

		final String fileURL = parser.get("fileURL");
		log.info("fileURL {}", fileURL);

		final String tmpFile = parser.get("tmpFile");
		log.info("tmpFile {}", tmpFile);

		final String outputFile = parser.get("outputFile");
		log.info("outputFile {}", outputFile);

		final String hdfsNameNode = parser.get("hdfsNameNode");
		log.info("hdfsNameNode {}", hdfsNameNode);

		final String classForName = parser.get("classForName");
		log.info("classForName {}", classForName);

		// An empty value falls back to the default instead of failing on charAt(0).
		final char delimiter = Optional
			.ofNullable(parser.get("delimiter"))
			.filter(s -> !s.isEmpty())
			.map(s -> s.charAt(0))
			.orElse(DEFAULT_DELIMITER);
		log.info("delimiter {}", delimiter);

		final HttpConnector2 connector2 = new HttpConnector2();

		// The outer try/finally guarantees the temporary file is removed even when the download
		// itself fails half-way, not only when the second phase fails.
		try {
			// BufferedWriter (unlike PrintWriter) propagates IOExceptions, so a failed or partial
			// write aborts the job instead of silently producing a truncated CSV.
			try (BufferedReader in = new BufferedReader(
				new InputStreamReader(connector2.getInputSourceAsStream(fileURL), StandardCharsets.UTF_8));
				BufferedWriter writer = new BufferedWriter(
					new OutputStreamWriter(new FileOutputStream(tmpFile), StandardCharsets.UTF_8))) {
				String line;
				while ((line = in.readLine()) != null) {
					// turn \" into " before the content is parsed as CSV
					writer.write(line.replace("\\\"", "\""));
					writer.newLine();
				}
			}

			try (BufferedReader in = new BufferedReader(
				new InputStreamReader(new FileInputStream(tmpFile), StandardCharsets.UTF_8))) {
				final Configuration conf = new Configuration();
				conf.set("fs.defaultFS", hdfsNameNode);

				final FileSystem fileSystem = FileSystem.get(conf);

				// NOTE(review): GetCSV is not visible here; presumably it parses the CSV rows into
				// instances of classForName and writes them to outputFile - confirm against GetCSV.
				GetCSV.getCsv(fileSystem, in, outputFile, classForName, delimiter);
			}
		} finally {
			FileUtils.deleteQuietly(new File(tmpFile));
		}
	}

	/**
	 * Reads the JSON argument specification from the classpath.
	 *
	 * @return the content of the parameter specification resource, decoded as UTF-8
	 * @throws IOException if the resource cannot be read
	 * @throws NullPointerException if the resource is not on the classpath
	 */
	private static String loadParameterSpec() throws IOException {
		try (InputStream spec = Objects
			.requireNonNull(
				DownloadCSV2.class.getResourceAsStream(PARAMETERS_RESOURCE),
				"missing classpath resource " + PARAMETERS_RESOURCE)) {
			return IOUtils.toString(spec, StandardCharsets.UTF_8);
		}
	}
}

View File

@ -6,9 +6,9 @@
"paramRequired": true "paramRequired": true
}, },
{ {
"paramName":"wp", "paramName":"tf",
"paramLongName":"workingPath", "paramLongName":"tmpFile",
"paramDescription": "the path where to find the pre-processed data for unibi gold list and doj artciles", "paramDescription": "the temporary local file storing the cleaned CSV contents for unibi gold list and doj artciles",
"paramRequired": true "paramRequired": true
}, },
{ {

View File

@ -73,7 +73,8 @@
<decision name="resume_from"> <decision name="resume_from">
<switch> <switch>
<case to="remove_hbmpath">${wf:conf('resumeFrom') eq 'ProduceHBM'}</case> <case to="produceHBM">${wf:conf('resumeFrom') eq 'ProduceHBM'}</case>
<case to="remove_hbmpath">${wf:conf('resumeFrom') eq 'download_csv'}</case>
<default to="prepareInfo"/> <default to="prepareInfo"/>
</switch> </switch>
</decision> </decision>
@ -98,10 +99,10 @@
<action name="download_gold"> <action name="download_gold">
<java> <java>
<main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.DownloadCSV</main-class> <main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.DownloadCSV2</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg> <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--fileURL</arg><arg>${unibiFileURL}</arg> <arg>--fileURL</arg><arg>${unibiFileURL}</arg>
<arg>--workingPath</arg><arg>${workingDir}/unibi_gold</arg> <arg>--tmpFile</arg><arg>/tmp/unibi_gold_replaced.csv</arg>
<arg>--outputFile</arg><arg>${workingDir}/unibi_gold.json</arg> <arg>--outputFile</arg><arg>${workingDir}/unibi_gold.json</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel</arg> <arg>--classForName</arg><arg>eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel</arg>
</java> </java>
@ -111,10 +112,10 @@
<action name="download_doaj"> <action name="download_doaj">
<java> <java>
<main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.DownloadCSV</main-class> <main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.DownloadCSV2</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg> <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--fileURL</arg><arg>${doajFileURL}</arg> <arg>--fileURL</arg><arg>${doajFileURL}</arg>
<arg>--workingPath</arg><arg>${workingDir}/doaj</arg> <arg>--tmpFile</arg><arg>/tmp/doaj_replaced.csv</arg>
<arg>--outputFile</arg><arg>${workingDir}/doaj.json</arg> <arg>--outputFile</arg><arg>${workingDir}/doaj.json</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.oa.graph.hostedbymap.model.DOAJModel</arg> <arg>--classForName</arg><arg>eu.dnetlib.dhp.oa.graph.hostedbymap.model.DOAJModel</arg>
</java> </java>
@ -141,7 +142,7 @@
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts> </spark-opts>
<arg>--datasourcePath</arg><arg>${sourcePath}/datasource</arg> <arg>--datasourcePath</arg><arg>${sourcePath}/datasource</arg>
<arg>--workingPath</arg><arg>${workingDir}</arg> <arg>--workingPath</arg><arg>/user/${wf:user()}/data</arg>
<arg>--outputPath</arg><arg>${hostedByMapPath}</arg> <arg>--outputPath</arg><arg>${hostedByMapPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg> <arg>--master</arg><arg>yarn-cluster</arg>
</spark> </spark>

View File

@ -1,133 +1,149 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap; package eu.dnetlib.dhp.oa.graph.hostedbymap;
import com.fasterxml.jackson.databind.ObjectMapper; import static org.junit.jupiter.api.Assertions.assertEquals;
import eu.dnetlib.dhp.common.collection.CollectorException; import static org.junit.jupiter.api.Assertions.assertTrue;
import eu.dnetlib.dhp.common.collection.GetCSV;
import eu.dnetlib.dhp.common.collection.HttpConnector2; import java.io.BufferedReader;
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.DOAJModel; import java.io.File;
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel; import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Files;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.junit.jupiter.api.*; import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*; import com.fasterxml.jackson.databind.ObjectMapper;
import java.nio.file.Files;
import static org.junit.jupiter.api.Assertions.assertTrue; import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.DOAJModel;
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel;
public class DownloadCsvTest { public class DownloadCsvTest {
private static String workingDir; private static final Logger log = LoggerFactory.getLogger(DownloadCsvTest.class);
private static LocalFileSystem fs; private static String workingDir;
@BeforeAll private static LocalFileSystem fs;
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(DownloadCsvTest.class.getSimpleName())
.toString();
fs = FileSystem.getLocal(new Configuration()); @BeforeAll
} public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(DownloadCsvTest.class.getSimpleName())
.toString();
@Disabled fs = FileSystem.getLocal(new Configuration());
@Test }
void getUnibiFileTest() throws CollectorException, IOException, ClassNotFoundException {
String fileURL = "https://pub.uni-bielefeld.de/download/2944717/2944718/issn_gold_oa_version_4.csv"; @Disabled
@Test
void getUnibiFileTest() throws CollectorException, IOException, ClassNotFoundException {
final String outputFile = workingDir + "/unibi_gold.json"; String fileURL = "https://pub.uni-bielefeld.de/download/2944717/2944718/issn_gold_oa_version_4.csv";
new DownloadCSV().doDownload(
fileURL,
workingDir + "/unibi_gold",
outputFile,
UnibiGoldModel.class.getName(),
',',
fs);
BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(outputFile)))); final String outputFile = workingDir + "/unibi_gold.json";
new DownloadCSV()
.doDownload(
fileURL,
workingDir + "/unibi_gold",
outputFile,
UnibiGoldModel.class.getName(),
',',
fs);
String line; BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(outputFile))));
int count = 0;
while ((line = in.readLine()) != null) {
UnibiGoldModel unibi = new ObjectMapper().readValue(line, UnibiGoldModel.class);
if (count == 0) {
assertTrue(unibi.getIssn().equals("0001-625X"));
assertTrue(unibi.getIssnL().equals("0001-625X"));
assertTrue(unibi.getTitle().equals("Acta Mycologica"));
} String line;
if (count == 43158) { int count = 0;
assertTrue(unibi.getIssn().equals("2088-6330")); while ((line = in.readLine()) != null) {
assertTrue(unibi.getIssnL().equals("2088-6330")); UnibiGoldModel unibi = new ObjectMapper().readValue(line, UnibiGoldModel.class);
assertTrue(unibi.getTitle().equals("Religió: Jurnal Studi Agama-agama")); if (count == 0) {
assertTrue(unibi.getIssn().equals("0001-625X"));
assertTrue(unibi.getIssnL().equals("0001-625X"));
assertTrue(unibi.getTitle().equals("Acta Mycologica"));
} }
if (count == 67027) { if (count == 43158) {
assertTrue(unibi.getIssn().equals("2658-7068")); assertTrue(unibi.getIssn().equals("2088-6330"));
assertTrue(unibi.getIssnL().equals("2308-2488")); assertTrue(unibi.getIssnL().equals("2088-6330"));
assertTrue(unibi.getTitle().equals("Istoriko-èkonomičeskie issledovaniâ.")); assertTrue(unibi.getTitle().equals("Religió: Jurnal Studi Agama-agama"));
}
count += 1; }
} if (count == 67027) {
assertTrue(unibi.getIssn().equals("2658-7068"));
assertTrue(unibi.getIssnL().equals("2308-2488"));
assertTrue(unibi.getTitle().equals("Istoriko-èkonomičeskie issledovaniâ."));
}
Assertions.assertEquals(67028, count); count += 1;
} }
@Disabled assertEquals(67028, count);
@Test }
void getDoajFileTest() throws CollectorException, IOException, ClassNotFoundException {
String fileURL = "https://doaj.org/csv"; @Disabled
@Test
void getDoajFileTest() throws CollectorException, IOException, ClassNotFoundException {
final String outputFile = workingDir + "/doaj.json"; String fileURL = "https://doaj.org/csv";
new DownloadCSV().doDownload(
fileURL,
workingDir + "/doaj",
outputFile,
DOAJModel.class.getName(),
',',
fs);
BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(outputFile)))); final String outputFile = workingDir + "/doaj.json";
new DownloadCSV()
.doDownload(
fileURL,
workingDir + "/doaj",
outputFile,
DOAJModel.class.getName(),
',',
fs);
String line; BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(outputFile))));
int count = 0;
while ((line = in.readLine()) != null) {
DOAJModel doaj = new ObjectMapper().readValue(line, DOAJModel.class);
if (count == 0) {
Assertions.assertEquals("0001-3765", doaj.getIssn());
Assertions.assertEquals("1678-2690", doaj.getEissn());
Assertions.assertEquals("Anais da Academia Brasileira de Ciências", doaj.getJournalTitle());
} String line;
if (count == 7904) { int count = 0;
System.out.println(new ObjectMapper().writeValueAsString(doaj)); while ((line = in.readLine()) != null) {
Assertions.assertEquals("", doaj.getIssn()); DOAJModel doaj = new ObjectMapper().readValue(line, DOAJModel.class);
Assertions.assertEquals("2055-7159", doaj.getEissn()); if (count == 0) {
Assertions.assertEquals("BJR|case reports", doaj.getJournalTitle()); assertEquals("0001-3765", doaj.getIssn());
} assertEquals("1678-2690", doaj.getEissn());
if (count == 16707) { assertEquals("Anais da Academia Brasileira de Ciências", doaj.getJournalTitle());
}
if (count == 22) {
log.info(new ObjectMapper().writeValueAsString(doaj));
System.out.println(new ObjectMapper().writeValueAsString(doaj));
}
if (count == 7904) {
// log.info(new ObjectMapper().writeValueAsString(doaj));
assertEquals("", doaj.getIssn());
assertEquals("2055-7159", doaj.getEissn());
assertEquals("BJR|case reports", doaj.getJournalTitle());
}
if (count == 16707) {
Assertions.assertEquals("", doaj.getIssn()); assertEquals("2783-1043", doaj.getIssn());
Assertions.assertEquals("2788-6298", doaj.getEissn()); assertEquals("2783-1051", doaj.getEissn());
Assertions assertEquals("فیزیک کاربردی ایران", doaj.getJournalTitle());
.assertEquals("Teacher Education through Flexible Learning in Africa", doaj.getJournalTitle()); }
}
count += 1; count += 1;
} }
Assertions.assertEquals(16713, count); assertEquals(16715, count);
} }
@AfterAll @AfterAll
public static void cleanup() { public static void cleanup() {
FileUtils.deleteQuietly(new File(workingDir)); FileUtils.deleteQuietly(new File(workingDir));
} }
} }