257 lines
7.6 KiB
Java
257 lines
7.6 KiB
Java
package eu.dnetlib.data.collector.plugins.excel;
|
|
|
|
/**
|
|
* Created by miriam on 10/05/2017.
|
|
*/
|
|
import java.io.File;
|
|
import java.io.FileInputStream;
|
|
import java.io.IOException;
|
|
import java.net.URL;
|
|
import java.util.ArrayList;
|
|
import java.util.HashMap;
|
|
import java.util.Iterator;
|
|
|
|
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
|
|
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
|
import org.apache.commons.lang3.StringUtils;
|
|
import org.apache.commons.logging.Log;
|
|
import org.apache.commons.logging.LogFactory;
|
|
import org.apache.poi.ss.usermodel.Cell;
|
|
import org.apache.poi.ss.usermodel.DataFormatter;
|
|
import org.apache.poi.ss.usermodel.Row;
|
|
import org.apache.poi.ss.usermodel.Sheet;
|
|
import org.apache.poi.ss.usermodel.Workbook;
|
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
|
import org.json.*;
|
|
|
|
import org.apache.commons.io.FileUtils;
|
|
|
|
public class Read {
|
|
|
|
private static final Log log = LogFactory.getLog(Read.class);
|
|
|
|
/** The descriptor. */
|
|
private InterfaceDescriptor descriptor;
|
|
|
|
|
|
/*private final String EXCEL_FILE_URL ="https://pf.fwf.ac.at/en/research-in-practice/project-finder.xlsx?&&&search%5Bcall%5D=&search%5Bdecision_board_ids%5D=&search%5Bend_date%5D=&search%5Binstitute_name%5D=&search%5Blead_firstname%5D=&search%5Blead_lastname%5D=&search%5Bper_page%5D=10&search%5Bproject_number%5D=&search%5Bproject_title%5D=&search%5Bscience_discipline_id%5D=&search%5Bstart_date%5D=&search%5Bstatus_id%5D=&search%5Bwhat%5D=&action=index&controller=projects&locale=en&per_page=10";
|
|
private final String CSV_FILE_PATH = "//Users//miriam//Documents//svn//mirima//FWF//projects_search2017.05.09.5.csv";
|
|
private final String argument = "{\"replace\":{\"header\":[{\"from\":\"&\",\"to\":\"and\"}],\"body\":[{\"from\":\"\\n\",\"to\":\" \"}]}," +
|
|
"\"replace_currency\":[{\"from\":\"$\",\"to\":\"€\"}],"
|
|
+ "\"col_currency\":10}"; */
|
|
private Sheet sheet;
|
|
private CSVFileWriter csv_writer = new CSVFileWriter();
|
|
private HashMap<String,String> map_header = new HashMap<String,String>();
|
|
private HashMap<String,String> map_body = new HashMap<String,String>();
|
|
private int header_row;
|
|
private String file_to_save ;
|
|
private boolean replace_currency = false;
|
|
private String from_currency, to_currency;
|
|
private boolean remove_empty, remove_tmp_file;
|
|
private String remove_id;
|
|
private int column_id;
|
|
private int currency_column;
|
|
private int sheet_number;
|
|
private String tmp_file;
|
|
private String argument;
|
|
private String identifier;
|
|
|
|
private HttpCSVCollectorPlugin collector;
|
|
|
|
public HttpCSVCollectorPlugin getCollector() {
|
|
return collector;
|
|
}
|
|
|
|
public void setCollector(HttpCSVCollectorPlugin collector) {
|
|
this.collector = collector;
|
|
}
|
|
|
|
public Read(InterfaceDescriptor descriptor){
|
|
this.descriptor = descriptor;
|
|
|
|
}
|
|
|
|
private static String getCellValue( Cell cell)
|
|
{
|
|
DataFormatter formatter = new DataFormatter();
|
|
String formattedCellValue = formatter.formatCellValue(cell);
|
|
return formattedCellValue;
|
|
|
|
}
|
|
|
|
private void copyFile() throws IOException{
|
|
FileUtils.copyURLToFile(new URL(descriptor.getBaseUrl()), new File(tmp_file));
|
|
|
|
}
|
|
|
|
private void parseDescriptor(){
|
|
HashMap<String, String> params = descriptor.getParams();
|
|
argument = params.get("argument");
|
|
header_row = Integer.parseInt(params.get("header_row"));
|
|
tmp_file = params.get("tmp_file");
|
|
remove_empty = (params.get("remove_empty_lines") == "yes");
|
|
remove_id = params.get("remove_lines_with_id");
|
|
column_id = Integer.parseInt(params.get("col_id"));
|
|
remove_tmp_file = (params.get("remove_tmp_file") == "yes");
|
|
sheet_number = Integer.parseInt(params.get("sheet_number"));
|
|
file_to_save = params.get("file_to_save");
|
|
}
|
|
private void init() throws IOException{
|
|
parseDescriptor();
|
|
log.info("Parsing the arguments");
|
|
parseArguments();
|
|
log.info("Copying the file in temp local file");
|
|
copyFile();
|
|
log.info("Extracting the sheet " + sheet_number);
|
|
FileInputStream fis = new FileInputStream(tmp_file);
|
|
Workbook workbook = new XSSFWorkbook(fis);
|
|
sheet = workbook.getSheetAt(sheet_number);
|
|
fis.close();
|
|
if(remove_tmp_file) {
|
|
File f = new File(tmp_file);
|
|
f.delete();
|
|
}
|
|
|
|
}
|
|
|
|
private void fillMap(JSONObject json, HashMap<String,String> map, String elem){
|
|
try{
|
|
final JSONArray arr = json.getJSONObject("replace").getJSONArray(elem);
|
|
for(Object entry: arr)
|
|
map.put(((JSONObject)entry).getString("from"), ((JSONObject)entry).getString("to"));
|
|
}catch(Throwable e){
|
|
log.error("Problems filling the map for " + elem);
|
|
throw(e);
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private void parseArguments() {
|
|
if (StringUtils.isNotEmpty(argument)){
|
|
try{
|
|
final JSONObject json = new JSONObject(argument);
|
|
if(json.has("header"))
|
|
fillMap(json, map_header,"header");
|
|
if (json.has("body"))
|
|
fillMap(json,map_body,"body");
|
|
|
|
if(json.has("replace_currency"))
|
|
{
|
|
replace_currency = true ;
|
|
from_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("from");
|
|
to_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("to");
|
|
|
|
}
|
|
|
|
if (json.has("col_currency"))
|
|
currency_column = json.getInt("col_currency");
|
|
}catch(Throwable e){
|
|
log.error("Problems while parsing the argument parameter.");
|
|
throw (e);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
private String applyReplace(String row, HashMap<String,String>replace){
|
|
for(String key: replace.keySet()){
|
|
if(row.contains(key))
|
|
row = row.replace(key, replace.get(key));
|
|
}
|
|
return row;
|
|
}
|
|
|
|
private void getHeader(){
|
|
Row row = sheet.getRow(header_row);
|
|
Iterator<Cell> cellIterator = row.cellIterator();
|
|
Cell cell;
|
|
String project = "";
|
|
int count = 0;
|
|
while (cellIterator.hasNext()){
|
|
cell = cellIterator.next();
|
|
final String stringCellValue = cell.getStringCellValue();
|
|
project += applyReplace(stringCellValue,map_header) + ";";
|
|
if(count++ == column_id) identifier = applyReplace(stringCellValue,map_header);
|
|
}
|
|
project = project.substring(0, project.length() -1 );
|
|
csv_writer.setHeader(project.split(";"));
|
|
|
|
}
|
|
|
|
private void getData(){
|
|
Row row;
|
|
Cell cell;
|
|
String tmp;
|
|
Iterator<Cell>cellIterator;
|
|
for(int row_number = header_row + 1; row_number < sheet.getLastRowNum(); row_number++){
|
|
row = sheet.getRow(row_number);
|
|
if (row != null) {
|
|
cellIterator = row.cellIterator();
|
|
|
|
int col_number = 0;
|
|
|
|
boolean discard_row = false;
|
|
ArrayList<String> al = new ArrayList<String>();
|
|
while (cellIterator.hasNext() && !discard_row) {
|
|
cell = cellIterator.next();
|
|
tmp = getCellValue(cell).trim();
|
|
tmp = tmp.replace("\n"," ");
|
|
if (col_number == column_id &&
|
|
((remove_empty && tmp.trim().equals("")) ||
|
|
(!remove_id.equals("") && tmp.equals(remove_id))))
|
|
discard_row = true;
|
|
|
|
if (replace_currency && col_number == currency_column)
|
|
tmp = tmp.replace(from_currency, to_currency);
|
|
|
|
al.add(applyReplace(tmp, map_body));
|
|
col_number++;
|
|
}
|
|
if (!discard_row) {
|
|
csv_writer.addProject(al);
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
private void writeCSVFile(){
|
|
|
|
csv_writer.writeFile(file_to_save);
|
|
}
|
|
|
|
private InterfaceDescriptor prepareHTTPCSVDescriptor(){
|
|
InterfaceDescriptor dex = new InterfaceDescriptor();
|
|
dex.setBaseUrl("file://"+file_to_save);
|
|
HashMap<String, String> params = new HashMap<String, String>();
|
|
params.put("separator", descriptor.getParams().get("separator"));
|
|
params.put("identifier",identifier);
|
|
params.put("quote",descriptor.getParams().get("quote"));
|
|
dex.setParams(params);
|
|
return dex;
|
|
}
|
|
|
|
public Iterable<String> parseFile() throws Exception{
|
|
|
|
|
|
init();
|
|
log.info("Getting header elements");
|
|
getHeader();
|
|
log.info("Getting sheet data");
|
|
getData();
|
|
log.info("Writing the csv file");
|
|
writeCSVFile();
|
|
log.info("Preparing to parse csv");
|
|
|
|
return collector.collect(prepareHTTPCSVDescriptor(),"","");
|
|
|
|
}
|
|
|
|
|
|
}
|