dnet-core/dnet-data-services/src/main/java/eu/dnetlib/data/collector/plugins/excel/Read.java

257 lines
7.6 KiB
Java

package eu.dnetlib.data.collector.plugins.excel;
/**
* Created by miriam on 10/05/2017.
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.json.*;
import org.apache.commons.io.FileUtils;
/**
 * Downloads the spreadsheet referenced by an {@link InterfaceDescriptor}, copies one of its
 * sheets into a CSV file through {@link CSVFileWriter}, and then delegates the actual record
 * collection to an {@link HttpCSVCollectorPlugin} pointed at that CSV file.
 *
 * <p>Descriptor parameters read by this class: {@code argument}, {@code header_row},
 * {@code tmp_file}, {@code remove_empty_lines}, {@code remove_lines_with_id}, {@code col_id},
 * {@code remove_tmp_file}, {@code sheet_number}, {@code file_to_save}, {@code separator} and
 * {@code quote}.
 */
public class Read {

    private static final Log log = LogFactory.getLog(Read.class);

    /** Separator used to join the header cells before they are split for the CSV writer. */
    private static final String HEADER_SEPARATOR = ";";

    /** Value that enables the boolean descriptor parameters. */
    private static final String YES = "yes";

    /** The descriptor. */
    private InterfaceDescriptor descriptor;

    /*
     * Example of the "argument" descriptor parameter (a JSON document):
     *
     * {"replace":{"header":[{"from":"&","to":"and"}],"body":[{"from":"\n","to":" "}]},
     *  "replace_currency":[{"from":"$","to":"€"}],
     *  "col_currency":10}
     */

    /** Formatter shared by all cell reads of this instance (avoids one allocation per cell). */
    private final DataFormatter formatter = new DataFormatter();

    private Sheet sheet;
    private final CSVFileWriter csv_writer = new CSVFileWriter();
    /** Replacements ("from" -> "to") applied to header cells. */
    private final HashMap<String, String> map_header = new HashMap<String, String>();
    /** Replacements ("from" -> "to") applied to body cells. */
    private final HashMap<String, String> map_body = new HashMap<String, String>();
    private int header_row;
    private String file_to_save;
    private boolean replace_currency = false;
    private String from_currency, to_currency;
    private boolean remove_empty, remove_tmp_file;
    private String remove_id;
    private int column_id;
    private int currency_column;
    private int sheet_number;
    private String tmp_file;
    private String argument;
    /** Header text (after replacements) of the cell found at position {@code col_id}. */
    private String identifier;
    private HttpCSVCollectorPlugin collector;

    public HttpCSVCollectorPlugin getCollector() {
        return collector;
    }

    public void setCollector(HttpCSVCollectorPlugin collector) {
        this.collector = collector;
    }

    /**
     * @param descriptor descriptor providing the base URL of the spreadsheet and the parameters
     *                   listed in the class documentation
     */
    public Read(InterfaceDescriptor descriptor) {
        this.descriptor = descriptor;
    }

    /** Returns the cell content rendered as text by the shared {@link DataFormatter}. */
    private String getCellValue(Cell cell) {
        return formatter.formatCellValue(cell);
    }

    /** Copies the resource at the descriptor base URL into the local temporary file. */
    private void copyFile() throws IOException {
        FileUtils.copyURLToFile(new URL(descriptor.getBaseUrl()), new File(tmp_file));
    }

    /**
     * Parses a mandatory integer parameter.
     *
     * @throws NumberFormatException if the parameter is missing or not an integer; the message
     *                               names the offending parameter
     */
    private static int intParam(HashMap<String, String> params, String name) {
        final String raw = params.get(name);
        try {
            return Integer.parseInt(raw);
        } catch (NumberFormatException e) {
            final NumberFormatException named =
                    new NumberFormatException("Parameter '" + name + "' is not a valid integer: " + raw);
            named.initCause(e);
            throw named;
        }
    }

    /** Reads the configuration values from the descriptor parameters into the fields. */
    private void parseDescriptor() {
        final HashMap<String, String> params = descriptor.getParams();
        argument = params.get("argument");
        header_row = intParam(params, "header_row");
        tmp_file = params.get("tmp_file");
        // equals(), not ==: the values come from runtime data, so reference comparison
        // against a literal would practically never be true.
        remove_empty = YES.equals(params.get("remove_empty_lines"));
        remove_id = params.get("remove_lines_with_id");
        column_id = intParam(params, "col_id");
        remove_tmp_file = YES.equals(params.get("remove_tmp_file"));
        sheet_number = intParam(params, "sheet_number");
        file_to_save = params.get("file_to_save");
    }

    /**
     * Parses the configuration, downloads the spreadsheet to the temporary file and loads the
     * configured sheet. The temporary file is removed afterwards (also on failure) when
     * {@code remove_tmp_file} is enabled.
     */
    private void init() throws IOException {
        parseDescriptor();
        log.info("Parsing the arguments");
        parseArguments();
        log.info("Copying the file in temp local file");
        try {
            copyFile();
            log.info("Extracting the sheet " + sheet_number);
            // try-with-resources: the stream is closed even when the workbook cannot be parsed.
            try (FileInputStream fis = new FileInputStream(tmp_file)) {
                final Workbook workbook = new XSSFWorkbook(fis);
                sheet = workbook.getSheetAt(sheet_number);
            }
        } finally {
            if (remove_tmp_file) {
                deleteTmpFile();
            }
        }
    }

    /** Deletes the temporary file, logging a warning when it exists but cannot be removed. */
    private void deleteTmpFile() {
        if (tmp_file == null) {
            return;
        }
        final File f = new File(tmp_file);
        if (f.exists() && !f.delete()) {
            log.warn("Unable to delete the temporary file " + tmp_file);
        }
    }

    /**
     * Adds to {@code map} every "from" -> "to" pair listed in {@code json.replace.<elem>}.
     *
     * @param json the whole parsed argument
     * @param map  the map to fill
     * @param elem name of the array inside the "replace" object ("header" or "body")
     */
    private void fillMap(JSONObject json, HashMap<String, String> map, String elem) {
        try {
            final JSONArray arr = json.getJSONObject("replace").getJSONArray(elem);
            for (Object entry : arr) {
                final JSONObject pair = (JSONObject) entry;
                map.put(pair.getString("from"), pair.getString("to"));
            }
        } catch (RuntimeException e) {
            log.error("Problems filling the map for " + elem, e);
            throw e;
        }
    }

    /**
     * Parses the JSON "argument" parameter (see the example near the top of the class): the
     * header/body replacement maps, the currency replacement and the currency column.
     * Does nothing when the argument is empty.
     */
    private void parseArguments() {
        if (StringUtils.isEmpty(argument)) {
            return;
        }
        try {
            final JSONObject json = new JSONObject(argument);
            // "header" and "body" live inside the "replace" object (that is where fillMap
            // reads them from), so their presence has to be checked there and not at top level.
            if (json.has("replace")) {
                final JSONObject replace = json.getJSONObject("replace");
                if (replace.has("header")) {
                    fillMap(json, map_header, "header");
                }
                if (replace.has("body")) {
                    fillMap(json, map_body, "body");
                }
            }
            if (json.has("replace_currency")) {
                final JSONObject currency = json.getJSONArray("replace_currency").getJSONObject(0);
                replace_currency = true;
                from_currency = currency.getString("from");
                to_currency = currency.getString("to");
            }
            if (json.has("col_currency")) {
                currency_column = json.getInt("col_currency");
            }
        } catch (RuntimeException e) {
            log.error("Problems while parsing the argument parameter.", e);
            throw e;
        }
    }

    /** Applies every replacement of {@code replace} (key replaced by value) to {@code row}. */
    private String applyReplace(String row, HashMap<String, String> replace) {
        for (String key : replace.keySet()) {
            row = row.replace(key, replace.get(key));
        }
        return row;
    }

    /**
     * Reads the header row, applies the header replacements, passes the resulting column names
     * to the CSV writer and remembers the name found at position {@code col_id} as identifier.
     *
     * @throws IllegalStateException if the header row is missing or has no cells
     */
    private void getHeader() {
        final Row row = sheet.getRow(header_row);
        if (row == null) {
            throw new IllegalStateException(
                    "Header row " + header_row + " not found in sheet " + sheet_number);
        }
        final StringBuilder joined = new StringBuilder();
        final Iterator<Cell> cellIterator = row.cellIterator();
        int count = 0;
        while (cellIterator.hasNext()) {
            final String name = applyReplace(cellIterator.next().getStringCellValue(), map_header);
            if (count > 0) {
                joined.append(HEADER_SEPARATOR);
            }
            joined.append(name);
            // NOTE(review): count is the position among the cells returned by the iterator,
            // which may differ from the sheet column index if cells are missing - confirm.
            if (count == column_id) {
                identifier = name;
            }
            count++;
        }
        if (count == 0) {
            throw new IllegalStateException(
                    "Header row " + header_row + " of sheet " + sheet_number + " has no cells");
        }
        csv_writer.setHeader(joined.toString().split(HEADER_SEPARATOR));
    }

    /**
     * Copies every row after the header into the CSV writer. Cell values are trimmed, newlines
     * become spaces, the currency replacement is applied on the currency column and the body
     * replacements on every cell. A row is skipped when its cell at position {@code col_id} is
     * empty (with {@code remove_empty_lines} enabled) or equals {@code remove_lines_with_id}.
     */
    private void getData() {
        // getLastRowNum() is the 0-based index of the last row, hence the inclusive bound.
        final int lastRow = sheet.getLastRowNum();
        for (int row_number = header_row + 1; row_number <= lastRow; row_number++) {
            final Row row = sheet.getRow(row_number);
            if (row == null) {
                continue;
            }
            final Iterator<Cell> cellIterator = row.cellIterator();
            final ArrayList<String> al = new ArrayList<String>();
            int col_number = 0;
            boolean discard_row = false;
            while (cellIterator.hasNext() && !discard_row) {
                String tmp = getCellValue(cellIterator.next()).trim();
                tmp = tmp.replace("\n", " ");
                if (col_number == column_id
                        && ((remove_empty && tmp.trim().equals(""))
                            // remove_id is null when the parameter is not configured
                            || (StringUtils.isNotEmpty(remove_id) && tmp.equals(remove_id)))) {
                    discard_row = true;
                }
                if (replace_currency && col_number == currency_column) {
                    tmp = tmp.replace(from_currency, to_currency);
                }
                al.add(applyReplace(tmp, map_body));
                col_number++;
            }
            if (!discard_row) {
                csv_writer.addProject(al);
            }
        }
    }

    /** Asks the CSV writer to write the collected content to {@code file_to_save}. */
    private void writeCSVFile() {
        csv_writer.writeFile(file_to_save);
    }

    /**
     * Builds the descriptor handed to the CSV collector: a {@code file://} base URL pointing to
     * the written CSV file plus the separator, identifier and quote parameters.
     */
    private InterfaceDescriptor prepareHTTPCSVDescriptor() {
        final InterfaceDescriptor dex = new InterfaceDescriptor();
        dex.setBaseUrl("file://" + file_to_save);
        final HashMap<String, String> params = new HashMap<String, String>();
        params.put("separator", descriptor.getParams().get("separator"));
        params.put("identifier", identifier);
        params.put("quote", descriptor.getParams().get("quote"));
        dex.setParams(params);
        return dex;
    }

    /**
     * Runs the whole conversion: download, header and data extraction, CSV writing, and finally
     * the collection of the written CSV through the configured collector.
     *
     * @return whatever the collector returns for the generated CSV descriptor
     * @throws IllegalStateException if no collector has been set
     * @throws Exception             on download, parsing or collection failures
     */
    public Iterable<String> parseFile() throws Exception {
        if (collector == null) {
            // Fail before downloading and converting anything.
            throw new IllegalStateException("No HttpCSVCollectorPlugin has been set");
        }
        init();
        log.info("Getting header elements");
        getHeader();
        log.info("Getting sheet data");
        getData();
        log.info("Writing the csv file");
        writeCSVFile();
        log.info("Preparing to parse csv");
        return collector.collect(prepareHTTPCSVDescriptor(), "", "");
    }
}