Adding H2020 Classification, topic code and topic description to H2020 projects #46

Merged
claudio.atzori merged 59 commits from miriam.baglioni/dnet-hadoop:h2020classification into master 2020-10-05 14:14:39 +02:00
8 changed files with 370 additions and 0 deletions
Showing only changes of commit f4739a371a - Show all commits

View File

@ -0,0 +1,66 @@
package eu.dnetlib.dhp.actionmanager.project.utils;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang.reflect.FieldUtils;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
/**
 * Reads the {@code cordisref-H2020topics} sheet of an XLSX workbook and turns every data row into a
 * bean whose fields are populated reflectively from the header row.
 */
public class EXCELParser {

    /** Bean type used when the caller does not supply a class name. */
    private static final String DEFAULT_CLASS_NAME = "eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic";

    /** Name of the only sheet this parser reads. */
    private static final String SHEET_NAME = "cordisref-H2020topics";

    /**
     * Parses the workbook contained in {@code file}.
     *
     * <p>The first row of the sheet is read as the header; each header value must be the name of a
     * field of the target bean. Every following row produces one bean, with each cell written (as
     * its formatted string value) into the field named by the header at the same position.
     *
     * @param file stream holding the XLSX document; it is consumed by this method
     * @param classForName fully qualified name of the bean class to instantiate for each row; when
     *        {@code null} or blank, {@code EXCELTopic} is used
     * @param <R> type the caller expects the beans to have (unchecked)
     * @return one bean per data row, in sheet order; empty when the sheet has only a header
     * @throws ClassNotFoundException if the bean class cannot be loaded
     * @throws IOException if the workbook cannot be read or closed
     * @throws IllegalAccessException if the bean or one of its fields is not accessible
     * @throws InstantiationException if the bean class cannot be instantiated
     * @throws InvalidFormatException if the stream is not a valid OOXML package
     * @throws IllegalArgumentException if the expected sheet is missing, or a header does not match
     *         a field of the bean class
     */
    public <R> List<R> parse(InputStream file, String classForName)
        throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException,
        InvalidFormatException {

        final String beanClassName = (classForName == null || classForName.trim().isEmpty())
            ? DEFAULT_CLASS_NAME
            : classForName;
        // Resolved once, up front: the lookup is loop-invariant and a wrong name should fail fast.
        final Class<?> clazz = Class.forName(beanClassName);

        // Both resources are released even when a row fails to map.
        try (OPCPackage pkg = OPCPackage.open(file); XSSFWorkbook wb = new XSSFWorkbook(pkg)) {
            final XSSFSheet sheet = wb.getSheet(SHEET_NAME);
            if (sheet == null) {
                throw new IllegalArgumentException("Sheet " + SHEET_NAME + " not present in the workbook");
            }

            final List<R> ret = new ArrayList<>();
            final List<String> headers = new ArrayList<>();
            final DataFormatter dataFormatter = new DataFormatter();

            final Iterator<Row> rowIterator = sheet.rowIterator();
            boolean headerRow = true;
            while (rowIterator.hasNext()) {
                final Row row = rowIterator.next();
                if (headerRow) {
                    // NOTE(review): headers are collected by iteration order but later matched to cells
                    // by column index, which assumes the header row has no gaps - confirm.
                    final Iterator<Cell> cellIterator = row.cellIterator();
                    while (cellIterator.hasNext()) {
                        headers.add(dataFormatter.formatCellValue(cellIterator.next()));
                    }
                    headerRow = false;
                } else {
                    @SuppressWarnings("unchecked") // the caller chooses R to match classForName
                    final R bean = (R) clazz.newInstance();
                    for (int i = 0; i < headers.size(); i++) {
                        final Cell cell = row.getCell(i);
                        FieldUtils.writeField(bean, headers.get(i), dataFormatter.formatCellValue(cell), true);
                    }
                    ret.add(bean);
                }
            }
            return ret;
        }
    }
}

View File

@ -0,0 +1,124 @@
package eu.dnetlib.dhp.actionmanager.project.utils;
import java.io.Serializable;
public class EXCELTopic implements Serializable {
private String rcn;
private String language;
private String code;
private String parentProgramme;
private String frameworkProgramme;
private String startDate;
private String endDate;
private String title;
private String shortTitle;
private String objective;
private String subjects;
private String legalBasis;
private String call;
public String getRcn() {
return rcn;
}
public void setRcn(String rcn) {
this.rcn = rcn;
}
public String getLanguage() {
return language;
}
public void setLanguage(String language) {
this.language = language;
}
public String getCode() {
return code;
}
public void setCode(String code) {
this.code = code;
}
public String getParentProgramme() {
return parentProgramme;
}
public void setParentProgramme(String parentProgramme) {
this.parentProgramme = parentProgramme;
}
public String getFrameworkProgramme() {
return frameworkProgramme;
}
public void setFrameworkProgramme(String frameworkProgramme) {
this.frameworkProgramme = frameworkProgramme;
}
public String getStartDate() {
return startDate;
}
public void setStartDate(String startDate) {
this.startDate = startDate;
}
public String getEndDate() {
return endDate;
}
public void setEndDate(String endDate) {
this.endDate = endDate;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getShortTitle() {
return shortTitle;
}
public void setShortTitle(String shortTitle) {
this.shortTitle = shortTitle;
}
public String getObjective() {
return objective;
}
public void setObjective(String objective) {
this.objective = objective;
}
public String getSubjects() {
return subjects;
}
public void setSubjects(String subjects) {
this.subjects = subjects;
}
public String getLegalBasis() {
return legalBasis;
}
public void setLegalBasis(String legalBasis) {
this.legalBasis = legalBasis;
}
public String getCall() {
return call;
}
public void setCall(String call) {
this.call = call;
}
}

View File

@ -0,0 +1,94 @@
package eu.dnetlib.dhp.actionmanager.project.utils;
import java.io.*;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class ReadExcel implements Closeable {
private static final Log log = LogFactory.getLog(ReadCSV.class);
private final Configuration conf;
private final BufferedWriter writer;
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private InputStream excelFile;
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
ReadCSV.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/project/parameters.json")));
parser.parseArgument(args);
final String fileURL = parser.get("fileURL");
final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("hdfsNameNode");
final String classForName = parser.get("classForName");
try (final ReadExcel readExcel = new ReadExcel(hdfsPath, hdfsNameNode, fileURL)) {
log.info("Getting Excel file...");
readExcel.execute(classForName);
}
}
public void execute(final String classForName) throws Exception {
EXCELParser excelParser = new EXCELParser();
excelParser
.parse(excelFile, classForName)
.stream()
.forEach(p -> write(p));
}
@Override
public void close() throws IOException {
writer.close();
}
public ReadExcel(
final String hdfsPath,
final String hdfsNameNode,
final String fileURL)
throws Exception {
this.conf = new Configuration();
this.conf.set("fs.defaultFS", hdfsNameNode);
HttpConnector httpConnector = new HttpConnector();
FileSystem fileSystem = FileSystem.get(this.conf);
Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, false);
}
fsDataOutputStream = fileSystem.create(hdfsWritePath);
this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
this.excelFile = httpConnector.getInputSourceAsStream(fileURL);
;

cleanup needed :)

cleanup needed :)
}
protected void write(final Object p) {
try {
writer.write(OBJECT_MAPPER.writeValueAsString(p));
writer.newLine();
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
}

View File

@ -0,0 +1,86 @@
package eu.dnetlib.dhp.actionmanager.project;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException;
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser;
public class EXCELParserTest {
private static Path workingDir;
private HttpConnector httpConnector = new HttpConnector();
private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx";
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(CSVParserTest.class.getSimpleName());
}
@Test
public void test1() throws CollectorServiceException, IOException, InvalidFormatException, ClassNotFoundException,
IllegalAccessException, InstantiationException {
EXCELParser excelParser = new EXCELParser();
List<Object> pl = excelParser
.parse(httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic");
System.out.println(pl.size());
// OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL));
// XSSFWorkbook wb = new XSSFWorkbook(pkg);
//
// XSSFSheet sheet = wb.getSheet("cordisref-H2020topics");
//
// DataFormatter dataFormatter = new DataFormatter();
// Iterator<Row> rowIterator = sheet.rowIterator();
// List<String> headers = new ArrayList<>();
// int count = 0;
// while (rowIterator.hasNext() && count <= 10) {
// Row row = rowIterator.next();
//
//
// if(count == 0){
// // Now let's iterate over the columns of the current row
// Iterator<Cell> cellIterator = row.cellIterator();
//
// while(cellIterator.hasNext()){
// Cell cell = cellIterator.next();
// headers.add(dataFormatter.formatCellValue(cell));
// }
// }else{
// Class<?> clazz = Class.forName("eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic");
// final Object cc = clazz.newInstance();
//
// for(int i =0; i<headers.size(); i++){
// Cell cell = row.getCell(i);
// FieldUtils.writeField(cc, headers.get(i),dataFormatter.formatCellValue(cell), true);
//
// }
//
// System.out.println(new Gson().toJson(cc));
// }
//
// count += 1;
// }
////
//// Iterator<org.apache.poi.ss.usermodel.Sheet> iterator = wb.sheetIterator();
//// System.out.println("Retrieving Sheets using Iterator");
//// while (iterator.hasNext()) {
//// Sheet sheet = iterator.next();
//// System.out.println("=> " + sheet.getSheetName());
//// }
//
// pkg.close();
}
}