diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java
deleted file mode 100644
index c53cd2127..000000000
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java
+++ /dev/null
@@ -1,47 +0,0 @@
-
-package eu.dnetlib.dhp.actionmanager.project.utils;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.commons.csv.CSVFormat;
-import org.apache.commons.csv.CSVRecord;
-import org.apache.commons.lang.reflect.FieldUtils;
-
-/**
- * Reads a generic csv and maps it into classes that mirror its schema
- */
-public class CSVParser {
-
-	public <R> List<R> parse(String csvFile, String classForName)
-		throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException {
-		return parse(csvFile, classForName, ';');
-	}
-
-	public <R> List<R> parse(String csvFile, String classForName, char delimiter)
-		throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException {
-		final CSVFormat format = CSVFormat.EXCEL
-			.withHeader()
-			.withDelimiter(delimiter)
-			.withQuote('"')
-			.withTrim();
-		List<R> ret = new ArrayList<>();
-		final org.apache.commons.csv.CSVParser parser = org.apache.commons.csv.CSVParser.parse(csvFile, format);
-		final Set<String> headers = parser.getHeaderMap().keySet();
-		Class<?> clazz = Class.forName(classForName);
-		for (CSVRecord csvRecord : parser.getRecords()) {
-
-			@SuppressWarnings("unchecked")
-			final R cc = (R) clazz.newInstance();
-			for (String header : headers) {
-				FieldUtils.writeField(cc, header, csvRecord.get(header), true);
-
-			}
-			ret.add(cc);
-		}
-
-		return ret;
-	}
-}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProject.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProject.java
deleted file mode 100644
index 268d5f28c..000000000
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProject.java
+++ /dev/null
@@ -1,200 +0,0 @@
-
-package eu.dnetlib.dhp.actionmanager.project.utils;
-
-import java.io.Serializable;
-
-/**
- * the mmodel for the projects csv file
- */
-public class CSVProject implements Serializable {
-	private String rcn;
-	private String id;
-	private String acronym;
-	private String status;
-	private String programme;
-	private String topics;
-	private String frameworkProgramme;
-	private String title;
-	private String startDate;
-	private String endDate;
-	private String projectUrl;
-	private String objective;
-	private String totalCost;
-	private String ecMaxContribution;
-	private String call;
-	private String fundingScheme;
-	private String coordinator;
-	private String coordinatorCountry;
-	private String participants;
-	private String participantCountries;
-	private String subjects;
-
-	public String getRcn() {
-		return rcn;
-	}
-
-	public void setRcn(String rcn) {
-		this.rcn = rcn;
-	}
-
-	public String getId() {
-		return id;
-	}
-
-	public void setId(String id) {
-		this.id = id;
-	}
-
-	public String getAcronym() {
-		return acronym;
-	}
-
-	public void setAcronym(String acronym) {
-		this.acronym = acronym;
-	}
-
-	public String getStatus() {
-		return status;
-	}
-
-	public void setStatus(String status) {
-		this.status = status;
-	}
-
-	public String getProgramme() {
-		return programme;
-	}
-
-	public void setProgramme(String programme) {
-		this.programme = programme;
-	}
-
-	public String getTopics() {
-		return topics;
-	}
-
-	public void setTopics(String topics) {
-		this.topics = topics;
-	}
-
-	public String getFrameworkProgramme() {
-		return frameworkProgramme;
-	}
-
-	public void setFrameworkProgramme(String frameworkProgramme) {
-		this.frameworkProgramme = frameworkProgramme;
-	}
-
-	public String getTitle() {
-		return title;
-	}
-
-	public void setTitle(String title) {
-		this.title = title;
-	}
-
-	public String getStartDate() {
-		return startDate;
-	}
-
-	public void setStartDate(String startDate) {
-		this.startDate = startDate;
-	}
-
-	public String getEndDate() {
-		return endDate;
-	}
-
-	public void setEndDate(String endDate) {
-		this.endDate = endDate;
-	}
-
-	public String getProjectUrl() {
-		return projectUrl;
-	}
-
-	public void setProjectUrl(String projectUrl) {
-		this.projectUrl = projectUrl;
-	}
-
-	public String getObjective() {
-		return objective;
-	}
-
-	public void setObjective(String objective) {
-		this.objective = objective;
-	}
-
-	public String getTotalCost() {
-		return totalCost;
-	}
-
-	public void setTotalCost(String totalCost) {
-		this.totalCost = totalCost;
-	}
-
-	public String getEcMaxContribution() {
-		return ecMaxContribution;
-	}
-
-	public void setEcMaxContribution(String ecMaxContribution) {
-		this.ecMaxContribution = ecMaxContribution;
-	}
-
-	public String getCall() {
-		return call;
-	}
-
-	public void setCall(String call) {
-		this.call = call;
-	}
-
-	public String getFundingScheme() {
-		return fundingScheme;
-	}
-
-	public void setFundingScheme(String fundingScheme) {
-		this.fundingScheme = fundingScheme;
-	}
-
-	public String getCoordinator() {
-		return coordinator;
-	}
-
-	public void setCoordinator(String coordinator) {
-		this.coordinator = coordinator;
-	}
-
-	public String getCoordinatorCountry() {
-		return coordinatorCountry;
-	}
-
-	public void setCoordinatorCountry(String coordinatorCountry) {
-		this.coordinatorCountry = coordinatorCountry;
-	}
-
-	public String getParticipants() {
-		return participants;
-	}
-
-	public void setParticipants(String participants) {
-		this.participants = participants;
-	}
-
-	public String getParticipantCountries() {
-		return participantCountries;
-	}
-
-	public void setParticipantCountries(String participantCountries) {
-		this.participantCountries = participantCountries;
-	}
-
-	public String getSubjects() {
-		return subjects;
-	}
-
-	public void setSubjects(String subjects) {
-		this.subjects = subjects;
-	}
-
-}
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/CSVParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/CSVParserTest.java
deleted file mode 100644
index dd7e1910f..000000000
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/CSVParserTest.java
+++ /dev/null
@@ -1,31 +0,0 @@
-
-package eu.dnetlib.dhp.actionmanager.project;
-
-import java.util.List;
-
-import org.apache.commons.io.IOUtils;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Test;
-
-import eu.dnetlib.dhp.actionmanager.project.utils.CSVParser;
-
-class CSVParserTest {
-
-	@Test
-	void readProgrammeTest() throws Exception {
-
-		String programmecsv = IOUtils
-			.toString(
-				getClass()
-					.getClassLoader()
-					.getResourceAsStream("eu/dnetlib/dhp/actionmanager/project/programme.csv"));
-
-		CSVParser csvParser = new CSVParser();
-
-		List pl = csvParser.parse(programmecsv, "eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme");
-
-		Assertions.assertEquals(24, pl.size());
-
-	}
-
-}
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz
deleted file mode 100644
index 620e1abfb..000000000
Binary files a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz and /dev/null differ
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/GetCSV.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/GetCSV.java
index 9516cf6f7..d7d369819 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/GetCSV.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hostedbymap/GetCSV.java
@@ -2,27 +2,16 @@
 package eu.dnetlib.dhp.oa.graph.hostedbymap;
 
 import java.io.*;
-import java.net.URL;
-import java.net.URLConnection;
-import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
 import java.util.Optional;
 
 import org.apache.commons.io.IOUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.opencsv.bean.CsvToBeanBuilder;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.collection.HttpConnector2;
 
 public class GetCSV {
-	private static final Log log = LogFactory.getLog(eu.dnetlib.dhp.oa.graph.hostedbymap.GetCSV.class);
 
 	public static void main(final String[] args) throws Exception {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@@ -38,66 +27,45 @@ public class GetCSV {
 		final String hdfsPath = parser.get("workingPath");
 		final String hdfsNameNode = parser.get("hdfsNameNode");
 		final String classForName = parser.get("classForName");
+		final String delimiter = Optional
+			.ofNullable(parser.get("delimiter"))
+			.orElse(null);
 		final Boolean shouldReplace = Optional
 			.ofNullable((parser.get("replace")))
 			.map(Boolean::valueOf)
 			.orElse(false);
 
-		URLConnection connection = new URL(fileURL).openConnection();
-		connection
-			.setRequestProperty(
-				"User-Agent",
-				"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
-		connection.connect();
+		char del = ';';
+		if (delimiter != null) {
+			del = delimiter.charAt(0);
+		}
+
+		HttpConnector2 connector2 = new HttpConnector2();
 
 		BufferedReader in = new BufferedReader(
-			new InputStreamReader(connection.getInputStream(), Charset.forName("UTF-8")));
+			new InputStreamReader(connector2.getInputSourceAsStream(fileURL)));
 
-		if (shouldReplace) {
-			PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter("/tmp/DOAJ.csv")));
-			String line = null;
-			while ((line = in.readLine()) != null) {
-				writer.println(line.replace("\\\"", "\""));
+		if (Boolean.TRUE.equals(shouldReplace)) {
+			try (PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter("/tmp/replaced.csv")))) {
+				String line;
+				while ((line = in.readLine()) != null) {
+					writer.println(line.replace("\\\"", "\""));
+				}
 			}
-			writer.close();
+
 			in.close();
-			in = new BufferedReader(new FileReader("/tmp/DOAJ.csv"));
+			in = new BufferedReader(new FileReader("/tmp/replaced.csv"));
 		}
 
 		Configuration conf = new Configuration();
 		conf.set("fs.defaultFS", hdfsNameNode);
 		FileSystem fileSystem = FileSystem.get(conf);
-		Path hdfsWritePath = new Path(hdfsPath);
-		FSDataOutputStream fsDataOutputStream = null;
-		if (fileSystem.exists(hdfsWritePath)) {
-			fileSystem.delete(hdfsWritePath, false);
-		}
-		fsDataOutputStream = fileSystem.create(hdfsWritePath);
 
-		BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
+		eu.dnetlib.dhp.common.collection.GetCSV.getCsv(fileSystem, in, hdfsPath, classForName, del);
 
-		Class clazz = Class.forName(classForName);
-
-		ObjectMapper mapper = new ObjectMapper();
-
-		new CsvToBeanBuilder(in)
-			.withType(clazz)
-			.withMultilineLimit(1)
-			.build()
-			.parse()
-			.forEach(line -> {
-				try {
-					writer.write(mapper.writeValueAsString(line));
-					writer.newLine();
-				} catch (IOException e) {
-					throw new RuntimeException(e);
-				}
-			});
-
-		writer.close();
 		in.close();
 
-		if (shouldReplace) {
+		if (Boolean.TRUE.equals(shouldReplace)) {
 			File f = new File("/tmp/DOAJ.csv");
 			f.delete();
 		}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestReadCSV.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestReadCSV.java
deleted file mode 100644
index 89259d814..000000000
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/hostedbymap/TestReadCSV.java
+++ /dev/null
@@ -1,112 +0,0 @@
-
-package eu.dnetlib.dhp.oa.graph.hostedbymap;
-
-import java.io.*;
-import java.net.URL;
-import java.net.URLConnection;
-import java.nio.charset.Charset;
-import java.util.List;
-
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-
-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.opencsv.bean.CsvToBeanBuilder;
-
-import eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel;
-
-public class TestReadCSV {
-
-	@Test
-	public void testCSVUnibi() throws FileNotFoundException {
-
-		final String sourcePath = getClass()
-			.getResource("/eu/dnetlib/dhp/oa/graph/hostedbymap/unibiGold.csv")
-			.getPath();
-
-		List<UnibiGoldModel> beans = new CsvToBeanBuilder(new FileReader(sourcePath))
-			.withType(UnibiGoldModel.class)
-			.build()
-			.parse();
-
-		Assertions.assertEquals(36, beans.size());
-		Assertions.assertEquals(1, beans.stream().filter(e -> e.getIssn().equals("0001-625X")).count());
-		Assertions
-			.assertTrue(
-				beans
-					.stream()
-					.anyMatch(e -> e.getIssn().equals("0001-625X") && e.getTitle().equals("Acta Mycologica")));
-		Assertions.assertTrue(beans.stream().allMatch(e -> e.getIssn().equals(e.getIssn_l())));
-
-	}
-
-	@Disabled
-	@Test
-	public void testCSVUrlUnibi() throws IOException {
-
-		URL csv = new URL("https://pub.uni-bielefeld.de/download/2944717/2944718/issn_gold_oa_version_4.csv");
-
-		BufferedReader in = new BufferedReader(new InputStreamReader(csv.openStream()));
-		ObjectMapper mapper = new ObjectMapper();
-
-		new CsvToBeanBuilder(in)
-			.withType(eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel.class)
-			.build()
-			.parse()
-			.forEach(line ->
-
-			{
-				try {
-					System.out.println(mapper.writeValueAsString(line));
-				} catch (JsonProcessingException e) {
-					e.printStackTrace();
-				}
-			}
-
-			);
-	}
-
-	@Disabled
-	@Test
-	public void testCSVUrlDOAJ() throws IOException {
-
-		URLConnection connection = new URL("https://doaj.org/csv").openConnection();
-		connection
-			.setRequestProperty(
-				"User-Agent",
-				"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
-		connection.connect();
-
-		BufferedReader in = new BufferedReader(
-			new InputStreamReader(connection.getInputStream(), Charset.forName("UTF-8")));
-		// BufferedReader in = new BufferedReader(new FileReader("/tmp/DOAJ.csv"));
-		PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter("/tmp/DOAJ_1.csv")));
-		String line = null;
-		while ((line = in.readLine()) != null) {
-			writer.println(line.replace("\\\"", "\""));
-		}
-		writer.close();
-		in.close();
-		in = new BufferedReader(new FileReader("/tmp/DOAJ_1.csv"));
-		ObjectMapper mapper = new ObjectMapper();
-
-		new CsvToBeanBuilder(in)
-			.withType(eu.dnetlib.dhp.oa.graph.hostedbymap.model.DOAJModel.class)
-			.withMultilineLimit(1)
-			.build()
-			.parse()
-			.forEach(lline ->
-
-			{
-				try {
-					System.out.println(mapper.writeValueAsString(lline));
-				} catch (JsonProcessingException e) {
-					e.printStackTrace();
-				}
-			}

-			);
-	}
-}