2020-05-19 18:42:50 +02:00
|
|
|
|
2020-09-28 12:05:47 +02:00
|
|
|
package eu.dnetlib.dhp.actionmanager.project.utils;
|
2020-05-19 18:42:50 +02:00
|
|
|
|
2021-08-12 18:03:41 +02:00
|
|
|
import java.io.*;
|
2021-07-22 12:01:48 +02:00
|
|
|
import java.util.Optional;
|
2020-05-19 18:42:50 +02:00
|
|
|
|
|
|
|
import org.apache.commons.io.IOUtils;
|
|
|
|
import org.apache.hadoop.conf.Configuration;
|
|
|
|
import org.apache.hadoop.fs.FileSystem;
|
|
|
|
|
|
|
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
2021-08-12 18:03:41 +02:00
|
|
|
import eu.dnetlib.dhp.common.collection.GetCSV;
|
|
|
|
import eu.dnetlib.dhp.common.collection.HttpConnector2;
|
2020-05-19 18:42:50 +02:00
|
|
|
|
2020-10-05 11:39:55 +02:00
|
|
|
/**
|
|
|
|
* Applies the parsing of a csv file and writes the Serialization of it in hdfs
|
|
|
|
*/
|
2021-08-12 18:03:41 +02:00
|
|
|
public class ReadCSV {
|
2020-05-19 18:42:50 +02:00
|
|
|
|
|
|
|
public static void main(final String[] args) throws Exception {
|
|
|
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
|
|
|
IOUtils
|
|
|
|
.toString(
|
|
|
|
ReadCSV.class
|
|
|
|
.getResourceAsStream(
|
|
|
|
"/eu/dnetlib/dhp/actionmanager/project/parameters.json")));
|
|
|
|
|
|
|
|
parser.parseArgument(args);
|
|
|
|
|
|
|
|
final String fileURL = parser.get("fileURL");
|
|
|
|
final String hdfsPath = parser.get("hdfsPath");
|
|
|
|
final String hdfsNameNode = parser.get("hdfsNameNode");
|
|
|
|
final String classForName = parser.get("classForName");
|
2021-07-22 12:01:48 +02:00
|
|
|
Optional<String> delimiter = Optional.ofNullable(parser.get("delimiter"));
|
|
|
|
char del = ';';
|
|
|
|
if (delimiter.isPresent())
|
|
|
|
del = delimiter.get().charAt(0);
|
|
|
|
|
2021-08-11 12:13:22 +02:00
|
|
|
Configuration conf = new Configuration();
|
|
|
|
conf.set("fs.defaultFS", hdfsNameNode);
|
2021-08-12 18:03:41 +02:00
|
|
|
|
2021-08-11 12:13:22 +02:00
|
|
|
FileSystem fileSystem = FileSystem.get(conf);
|
2021-08-12 18:03:41 +02:00
|
|
|
BufferedReader reader = new BufferedReader(
|
|
|
|
new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL)));
|
2021-08-11 12:13:22 +02:00
|
|
|
|
2021-08-12 18:03:41 +02:00
|
|
|
GetCSV.getCsv(fileSystem, reader, hdfsPath, classForName, del);
|
2020-05-19 18:42:50 +02:00
|
|
|
|
2021-08-12 18:03:41 +02:00
|
|
|
reader.close();
|
2020-05-19 18:42:50 +02:00
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|