forked from D-Net/dnet-hadoop
code to get the information related to the topic association between code and description.
This commit is contained in:
parent
7b6a7333e6
commit
f4739a371a
|
@ -0,0 +1,66 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.project.utils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang.reflect.FieldUtils;
|
||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.ss.usermodel.Cell;
|
||||
import org.apache.poi.ss.usermodel.DataFormatter;
|
||||
import org.apache.poi.ss.usermodel.Row;
|
||||
import org.apache.poi.xssf.usermodel.XSSFSheet;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
|
||||
public class EXCELParser {
|
||||
|
||||
public <R> List<R> parse(InputStream file, String classForName)
|
||||
throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException,
|
||||
InvalidFormatException {
|
||||
|
||||
// OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL));
|
||||
OPCPackage pkg = OPCPackage.open(file);
|
||||
XSSFWorkbook wb = new XSSFWorkbook(pkg);
|
||||
|
||||
XSSFSheet sheet = wb.getSheet("cordisref-H2020topics");
|
||||
|
||||
List<R> ret = new ArrayList<>();
|
||||
|
||||
DataFormatter dataFormatter = new DataFormatter();
|
||||
Iterator<Row> rowIterator = sheet.rowIterator();
|
||||
List<String> headers = new ArrayList<>();
|
||||
int count = 0;
|
||||
while (rowIterator.hasNext()) {
|
||||
Row row = rowIterator.next();
|
||||
|
||||
if (count == 0) {
|
||||
Iterator<Cell> cellIterator = row.cellIterator();
|
||||
|
||||
while (cellIterator.hasNext()) {
|
||||
Cell cell = cellIterator.next();
|
||||
headers.add(dataFormatter.formatCellValue(cell));
|
||||
}
|
||||
} else {
|
||||
Class<?> clazz = Class.forName("eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic");
|
||||
final Object cc = clazz.newInstance();
|
||||
|
||||
for (int i = 0; i < headers.size(); i++) {
|
||||
Cell cell = row.getCell(i);
|
||||
FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true);
|
||||
|
||||
}
|
||||
|
||||
ret.add((R) cc);
|
||||
}
|
||||
|
||||
count += 1;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,124 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.project.utils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class EXCELTopic implements Serializable {
|
||||
private String rcn;
|
||||
private String language;
|
||||
private String code;
|
||||
private String parentProgramme;
|
||||
private String frameworkProgramme;
|
||||
private String startDate;
|
||||
private String endDate;
|
||||
private String title;
|
||||
private String shortTitle;
|
||||
private String objective;
|
||||
private String subjects;
|
||||
private String legalBasis;
|
||||
private String call;
|
||||
|
||||
public String getRcn() {
|
||||
return rcn;
|
||||
}
|
||||
|
||||
public void setRcn(String rcn) {
|
||||
this.rcn = rcn;
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
public String getCode() {
|
||||
return code;
|
||||
}
|
||||
|
||||
public void setCode(String code) {
|
||||
this.code = code;
|
||||
}
|
||||
|
||||
public String getParentProgramme() {
|
||||
return parentProgramme;
|
||||
}
|
||||
|
||||
public void setParentProgramme(String parentProgramme) {
|
||||
this.parentProgramme = parentProgramme;
|
||||
}
|
||||
|
||||
public String getFrameworkProgramme() {
|
||||
return frameworkProgramme;
|
||||
}
|
||||
|
||||
public void setFrameworkProgramme(String frameworkProgramme) {
|
||||
this.frameworkProgramme = frameworkProgramme;
|
||||
}
|
||||
|
||||
public String getStartDate() {
|
||||
return startDate;
|
||||
}
|
||||
|
||||
public void setStartDate(String startDate) {
|
||||
this.startDate = startDate;
|
||||
}
|
||||
|
||||
public String getEndDate() {
|
||||
return endDate;
|
||||
}
|
||||
|
||||
public void setEndDate(String endDate) {
|
||||
this.endDate = endDate;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public String getShortTitle() {
|
||||
return shortTitle;
|
||||
}
|
||||
|
||||
public void setShortTitle(String shortTitle) {
|
||||
this.shortTitle = shortTitle;
|
||||
}
|
||||
|
||||
public String getObjective() {
|
||||
return objective;
|
||||
}
|
||||
|
||||
public void setObjective(String objective) {
|
||||
this.objective = objective;
|
||||
}
|
||||
|
||||
public String getSubjects() {
|
||||
return subjects;
|
||||
}
|
||||
|
||||
public void setSubjects(String subjects) {
|
||||
this.subjects = subjects;
|
||||
}
|
||||
|
||||
public String getLegalBasis() {
|
||||
return legalBasis;
|
||||
}
|
||||
|
||||
public void setLegalBasis(String legalBasis) {
|
||||
this.legalBasis = legalBasis;
|
||||
}
|
||||
|
||||
public String getCall() {
|
||||
return call;
|
||||
}
|
||||
|
||||
public void setCall(String call) {
|
||||
this.call = call;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,94 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.project.utils;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class ReadExcel implements Closeable {
|
||||
private static final Log log = LogFactory.getLog(ReadCSV.class);
|
||||
private final Configuration conf;
|
||||
private final BufferedWriter writer;
|
||||
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
private InputStream excelFile;
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
ReadCSV.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/project/parameters.json")));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String fileURL = parser.get("fileURL");
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
final String hdfsNameNode = parser.get("hdfsNameNode");
|
||||
final String classForName = parser.get("classForName");
|
||||
|
||||
try (final ReadExcel readExcel = new ReadExcel(hdfsPath, hdfsNameNode, fileURL)) {
|
||||
|
||||
log.info("Getting Excel file...");
|
||||
readExcel.execute(classForName);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public void execute(final String classForName) throws Exception {
|
||||
EXCELParser excelParser = new EXCELParser();
|
||||
excelParser
|
||||
.parse(excelFile, classForName)
|
||||
.stream()
|
||||
.forEach(p -> write(p));
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
writer.close();
|
||||
}
|
||||
|
||||
public ReadExcel(
|
||||
final String hdfsPath,
|
||||
final String hdfsNameNode,
|
||||
final String fileURL)
|
||||
throws Exception {
|
||||
this.conf = new Configuration();
|
||||
this.conf.set("fs.defaultFS", hdfsNameNode);
|
||||
HttpConnector httpConnector = new HttpConnector();
|
||||
FileSystem fileSystem = FileSystem.get(this.conf);
|
||||
Path hdfsWritePath = new Path(hdfsPath);
|
||||
FSDataOutputStream fsDataOutputStream = null;
|
||||
if (fileSystem.exists(hdfsWritePath)) {
|
||||
fileSystem.delete(hdfsWritePath, false);
|
||||
}
|
||||
fsDataOutputStream = fileSystem.create(hdfsWritePath);
|
||||
|
||||
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
|
||||
this.excelFile = httpConnector.getInputSourceAsStream(fileURL);
|
||||
;
|
||||
}
|
||||
|
||||
protected void write(final Object p) {
|
||||
try {
|
||||
writer.write(OBJECT_MAPPER.writeValueAsString(p));
|
||||
writer.newLine();
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,86 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.project;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException;
|
||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
|
||||
import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser;
|
||||
|
||||
public class EXCELParserTest {
|
||||
|
||||
private static Path workingDir;
|
||||
private HttpConnector httpConnector = new HttpConnector();
|
||||
private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx";
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files.createTempDirectory(CSVParserTest.class.getSimpleName());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1() throws CollectorServiceException, IOException, InvalidFormatException, ClassNotFoundException,
|
||||
IllegalAccessException, InstantiationException {
|
||||
|
||||
EXCELParser excelParser = new EXCELParser();
|
||||
|
||||
List<Object> pl = excelParser
|
||||
.parse(httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic");
|
||||
|
||||
System.out.println(pl.size());
|
||||
|
||||
// OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL));
|
||||
// XSSFWorkbook wb = new XSSFWorkbook(pkg);
|
||||
//
|
||||
// XSSFSheet sheet = wb.getSheet("cordisref-H2020topics");
|
||||
//
|
||||
// DataFormatter dataFormatter = new DataFormatter();
|
||||
// Iterator<Row> rowIterator = sheet.rowIterator();
|
||||
// List<String> headers = new ArrayList<>();
|
||||
// int count = 0;
|
||||
// while (rowIterator.hasNext() && count <= 10) {
|
||||
// Row row = rowIterator.next();
|
||||
//
|
||||
//
|
||||
// if(count == 0){
|
||||
// // Now let's iterate over the columns of the current row
|
||||
// Iterator<Cell> cellIterator = row.cellIterator();
|
||||
//
|
||||
// while(cellIterator.hasNext()){
|
||||
// Cell cell = cellIterator.next();
|
||||
// headers.add(dataFormatter.formatCellValue(cell));
|
||||
// }
|
||||
// }else{
|
||||
// Class<?> clazz = Class.forName("eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic");
|
||||
// final Object cc = clazz.newInstance();
|
||||
//
|
||||
// for(int i =0; i<headers.size(); i++){
|
||||
// Cell cell = row.getCell(i);
|
||||
// FieldUtils.writeField(cc, headers.get(i),dataFormatter.formatCellValue(cell), true);
|
||||
//
|
||||
// }
|
||||
//
|
||||
// System.out.println(new Gson().toJson(cc));
|
||||
// }
|
||||
//
|
||||
// count += 1;
|
||||
// }
|
||||
////
|
||||
//// Iterator<org.apache.poi.ss.usermodel.Sheet> iterator = wb.sheetIterator();
|
||||
//// System.out.println("Retrieving Sheets using Iterator");
|
||||
//// while (iterator.hasNext()) {
|
||||
//// Sheet sheet = iterator.next();
|
||||
//// System.out.println("=> " + sheet.getSheetName());
|
||||
//// }
|
||||
//
|
||||
// pkg.close();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue