diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVParser.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVProgramme.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVProject.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProject.java similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/CSVProject.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProject.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java new file mode 100644 index 000000000..fbbd92b9b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java @@ -0,0 +1,66 @@ + +package eu.dnetlib.dhp.actionmanager.project.utils; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.lang.reflect.FieldUtils; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.xssf.usermodel.XSSFSheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; + +public class EXCELParser { + + public List parse(InputStream file, String classForName) + throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException, + InvalidFormatException { + + // OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL)); + OPCPackage pkg = OPCPackage.open(file); + XSSFWorkbook wb = new XSSFWorkbook(pkg); + + XSSFSheet sheet = wb.getSheet("cordisref-H2020topics"); + + List ret = new ArrayList<>(); + + DataFormatter dataFormatter = new DataFormatter(); + Iterator rowIterator = sheet.rowIterator(); + List headers = new ArrayList<>(); + int count = 0; + while (rowIterator.hasNext()) { + Row row = rowIterator.next(); + + if (count == 0) { + Iterator cellIterator = row.cellIterator(); + + while (cellIterator.hasNext()) { + Cell cell = cellIterator.next(); + headers.add(dataFormatter.formatCellValue(cell)); + } + } else { + Class clazz = Class.forName("eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic"); + final Object cc = clazz.newInstance(); + + for (int i = 0; i < headers.size(); i++) { + Cell cell = row.getCell(i); + FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true); + + } + + ret.add((R) cc); + } + + count += 1; + } + + return ret; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELTopic.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELTopic.java new file mode 100644 index 000000000..cc317e037 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELTopic.java @@ -0,0 +1,124 @@ + +package eu.dnetlib.dhp.actionmanager.project.utils; + +import java.io.Serializable; + +public class EXCELTopic implements Serializable { + private String rcn; + private String language; + private String code; + private String parentProgramme; + private String frameworkProgramme; + private String startDate; + private String endDate; + private String title; + private String shortTitle; + private String objective; + private String subjects; + private String legalBasis; + private String call; + + public String getRcn() { + return rcn; + } + + public void setRcn(String rcn) { + this.rcn = rcn; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public String getCode() { + return code; + } + + public void setCode(String code) { + this.code = code; + } + + public String getParentProgramme() { + return parentProgramme; + } + + public void setParentProgramme(String parentProgramme) { + this.parentProgramme = parentProgramme; + } + + public String getFrameworkProgramme() { + return frameworkProgramme; + } + + public void setFrameworkProgramme(String frameworkProgramme) { + this.frameworkProgramme = frameworkProgramme; + } + + public String getStartDate() { + return startDate; + } + + public void setStartDate(String startDate) { + this.startDate = startDate; + } + + public String getEndDate() { + return endDate; + } + + public void setEndDate(String endDate) { + this.endDate = endDate; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getShortTitle() { + return shortTitle; + } + + public void setShortTitle(String shortTitle) { + this.shortTitle = shortTitle; + } + + public String getObjective() { + return objective; + } + + public void setObjective(String objective) { + this.objective = objective; + } + + public String getSubjects() { + return subjects; + } + + public void setSubjects(String subjects) { + this.subjects = subjects; + } + + public String getLegalBasis() { + return legalBasis; + } + + public void setLegalBasis(String legalBasis) { + this.legalBasis = legalBasis; + } + + public String getCall() { + return call; + } + + public void setCall(String call) { + this.call = call; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/ReadCSV.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/csvutils/ReadCSV.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java new file mode 100644 index 000000000..d65b433a6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java @@ -0,0 +1,94 @@ + +package eu.dnetlib.dhp.actionmanager.project.utils; + +import java.io.*; +import java.nio.charset.StandardCharsets; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +public class ReadExcel implements Closeable { + private static final Log log = LogFactory.getLog(ReadCSV.class); + private final Configuration conf; + private final BufferedWriter writer; + private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private InputStream excelFile; + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + ReadCSV.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/project/parameters.json"))); + + parser.parseArgument(args); + + final String fileURL = parser.get("fileURL"); + final String hdfsPath = parser.get("hdfsPath"); + final String hdfsNameNode = parser.get("hdfsNameNode"); + final String classForName = parser.get("classForName"); + + try (final ReadExcel readExcel = new ReadExcel(hdfsPath, hdfsNameNode, fileURL)) { + + log.info("Getting Excel file..."); + readExcel.execute(classForName); + + } + } + + public void execute(final String classForName) throws Exception { + EXCELParser excelParser = new EXCELParser(); + excelParser + .parse(excelFile, classForName) + .stream() + .forEach(p -> write(p)); + + } + + @Override + public void close() throws IOException { + writer.close(); + } + + public ReadExcel( + final String hdfsPath, + final String hdfsNameNode, + final String fileURL) + throws Exception { + this.conf = new Configuration(); + this.conf.set("fs.defaultFS", hdfsNameNode); + HttpConnector httpConnector = new HttpConnector(); + FileSystem fileSystem = FileSystem.get(this.conf); + Path hdfsWritePath = new Path(hdfsPath); + FSDataOutputStream fsDataOutputStream = null; + if (fileSystem.exists(hdfsWritePath)) { + fileSystem.delete(hdfsWritePath, false); + } + fsDataOutputStream = fileSystem.create(hdfsWritePath); + + this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8)); + this.excelFile = httpConnector.getInputSourceAsStream(fileURL); + ; + } + + protected void write(final Object p) { + try { + writer.write(OBJECT_MAPPER.writeValueAsString(p)); + writer.newLine(); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java new file mode 100644 index 000000000..11ee0eef1 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java @@ -0,0 +1,86 @@ + +package eu.dnetlib.dhp.actionmanager.project; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException; +import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; +import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser; + +public class EXCELParserTest { + + private static Path workingDir; + private HttpConnector httpConnector = new HttpConnector(); + private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx"; + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(CSVParserTest.class.getSimpleName()); + + } + + @Test + public void test1() throws CollectorServiceException, IOException, InvalidFormatException, ClassNotFoundException, + IllegalAccessException, InstantiationException { + + EXCELParser excelParser = new EXCELParser(); + + List pl = excelParser + .parse(httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic"); + + System.out.println(pl.size()); + +// OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL)); +// XSSFWorkbook wb = new XSSFWorkbook(pkg); +// +// XSSFSheet sheet = wb.getSheet("cordisref-H2020topics"); +// +// DataFormatter dataFormatter = new DataFormatter(); +// Iterator rowIterator = sheet.rowIterator(); +// List headers = new ArrayList<>(); +// int count = 0; +// while (rowIterator.hasNext() && count <= 10) { +// Row row = rowIterator.next(); +// +// +// if(count == 0){ +// // Now let's iterate over the columns of the current row +// Iterator cellIterator = row.cellIterator(); +// +// while(cellIterator.hasNext()){ +// Cell cell = cellIterator.next(); +// headers.add(dataFormatter.formatCellValue(cell)); +// } +// }else{ +// Class clazz = Class.forName("eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic"); +// final Object cc = clazz.newInstance(); +// +// for(int i =0; i iterator = wb.sheetIterator(); +//// System.out.println("Retrieving Sheets using Iterator"); +//// while (iterator.hasNext()) { +//// Sheet sheet = iterator.next(); +//// System.out.println("=> " + sheet.getSheetName()); +//// } +// +// pkg.close(); + } +}