package eu.dnetlib.data.collector.plugins.excel;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.json.*;

/**
 * Converts one sheet of a remote Excel (.xlsx) workbook into a CSV file and delegates the
 * collection of that CSV to an {@link HttpCSVCollectorPlugin}.
 *
 * <p>The workbook is downloaded from the descriptor's base URL into a local temporary file.
 * The following descriptor parameters are read:
 * <ul>
 *   <li>{@code argument} - optional JSON document with the replacement rules (see below);</li>
 *   <li>{@code header_row} - index of the row holding the column names;</li>
 *   <li>{@code sheet_number} - index of the sheet to read;</li>
 *   <li>{@code col_id} - position of the identifier column;</li>
 *   <li>{@code remove_empty_lines} - {@code yes} to drop rows whose identifier is empty;</li>
 *   <li>{@code remove_lines_with_id} - rows whose identifier equals this value are dropped;</li>
 *   <li>{@code tmp_file} - path of the local copy of the workbook;</li>
 *   <li>{@code remove_tmp_file} - {@code yes} to delete the local copy once it has been read;</li>
 *   <li>{@code file_to_save} - path of the CSV file to produce;</li>
 *   <li>{@code separator}, {@code quote} - forwarded unchanged to the CSV collector.</li>
 * </ul>
 *
 * <p>Example of the {@code argument} parameter:
 * <pre>{@code
 * {"replace":{"header":[{"from":"&","to":"and"}],"body":[{"from":"\n","to":" "}]},
 *  "replace_currency":[{"from":"$","to":"€"}],
 *  "col_currency":10}
 * }</pre>
 *
 * Created by miriam on 10/05/2017.
 */
public class Read {

    private static final Log log = LogFactory.getLog(Read.class);

    /** Value that switches a yes/no descriptor parameter on. */
    private static final String YES = "yes";

    /** The descriptor. */
    private InterfaceDescriptor descriptor;

    /** Formats cell content as text; created once instead of once per cell. */
    private final DataFormatter formatter = new DataFormatter();

    private Sheet sheet;
    private CSVFileWriter csv_writer = new CSVFileWriter();

    // Insertion-ordered so that replacements are applied in the order they were declared.
    private HashMap<String, String> map_header = new LinkedHashMap<>();
    private HashMap<String, String> map_body = new LinkedHashMap<>();

    private int header_row;
    private String file_to_save;
    private boolean replace_currency = false;
    private String from_currency, to_currency;
    private boolean remove_empty, remove_tmp_file;
    private String remove_id;
    private int column_id;
    private int currency_column;
    private int sheet_number;
    private String tmp_file;
    private String argument;
    private String identifier;

    private HttpCSVCollectorPlugin collector;

    public HttpCSVCollectorPlugin getCollector() {
        return collector;
    }

    public void setCollector(HttpCSVCollectorPlugin collector) {
        this.collector = collector;
    }

    /**
     * @param descriptor the interface descriptor providing the workbook URL and the parameters
     */
    public Read(InterfaceDescriptor descriptor) {
        this.descriptor = descriptor;
    }

    /** Returns the content of the cell formatted as text. */
    private String getCellValue(Cell cell) {
        return formatter.formatCellValue(cell);
    }

    /** Downloads the workbook from the descriptor's base URL into {@code tmp_file}. */
    private void copyFile() throws IOException {
        FileUtils.copyURLToFile(new URL(descriptor.getBaseUrl()), new File(tmp_file));
    }

    /**
     * Copies the descriptor parameters into the fields of this instance.
     *
     * @throws NumberFormatException if one of the numeric parameters is missing or not an integer
     */
    private void parseDescriptor() {
        final Map<String, String> params = descriptor.getParams();
        argument = params.get("argument");
        header_row = Integer.parseInt(params.get("header_row"));
        tmp_file = params.get("tmp_file");
        // equals() rather than ==: the values are not guaranteed to be interned literals.
        remove_empty = YES.equals(params.get("remove_empty_lines"));
        remove_id = params.get("remove_lines_with_id");
        column_id = Integer.parseInt(params.get("col_id"));
        remove_tmp_file = YES.equals(params.get("remove_tmp_file"));
        sheet_number = Integer.parseInt(params.get("sheet_number"));
        file_to_save = params.get("file_to_save");
    }

    /**
     * Reads the configuration, downloads the workbook and loads the requested sheet.
     *
     * @throws IOException if the workbook cannot be downloaded or read
     */
    private void init() throws IOException {
        parseDescriptor();
        log.info("Parsing the arguments");
        parseArguments();
        log.info("Copying the file in temp local file");
        copyFile();
        log.info("Extracting the sheet " + sheet_number);
        try (FileInputStream fis = new FileInputStream(tmp_file)) {
            // The workbook itself is not closed here: its sheet is read after this method returns.
            final Workbook workbook = new XSSFWorkbook(fis);
            sheet = workbook.getSheetAt(sheet_number);
        } finally {
            // Runs after the stream has been closed, also when loading the workbook failed.
            if (remove_tmp_file) {
                final File tmp = new File(tmp_file);
                if (tmp.exists() && !tmp.delete()) {
                    log.warn("Unable to delete the temporary file " + tmp_file);
                }
            }
        }
    }

    /**
     * Adds to {@code map} the from/to pairs listed under {@code replace.<elem>} of the given JSON.
     *
     * @param json the whole {@code argument} document
     * @param map  the map to fill
     * @param elem the name of the array inside {@code replace} ({@code header} or {@code body})
     */
    private void fillMap(JSONObject json, HashMap<String, String> map, String elem) {
        try {
            final JSONArray arr = json.getJSONObject("replace").getJSONArray(elem);
            for (Object entry : arr) {
                final JSONObject pair = (JSONObject) entry;
                map.put(pair.getString("from"), pair.getString("to"));
            }
        } catch (RuntimeException e) {
            log.error("Problems filling the map for " + elem, e);
            throw e;
        }
    }

    /**
     * Parses the JSON {@code argument} parameter, if any, into the replacement maps and the
     * currency settings.
     */
    private void parseArguments() {
        if (StringUtils.isEmpty(argument)) {
            return;
        }
        try {
            final JSONObject json = new JSONObject(argument);

            // "header" and "body" live inside "replace", so that is where their presence is checked.
            final JSONObject replace = json.optJSONObject("replace");
            if (replace != null) {
                if (replace.has("header")) {
                    fillMap(json, map_header, "header");
                }
                if (replace.has("body")) {
                    fillMap(json, map_body, "body");
                }
            }

            if (json.has("replace_currency")) {
                final JSONObject currency = json.getJSONArray("replace_currency").getJSONObject(0);
                from_currency = currency.getString("from");
                to_currency = currency.getString("to");
                replace_currency = true;
            }
            if (json.has("col_currency")) {
                currency_column = json.getInt("col_currency");
            }
        } catch (RuntimeException e) {
            log.error("Problems while parsing the argument parameter.", e);
            throw e;
        }
    }

    /** Applies every from/to replacement of {@code replace} to {@code row} and returns the result. */
    private String applyReplace(String row, Map<String, String> replace) {
        for (Map.Entry<String, String> rule : replace.entrySet()) {
            row = row.replace(rule.getKey(), rule.getValue());
        }
        return row;
    }

    /**
     * Reads the header row, applies the header replacements, passes the column names to the CSV
     * writer and remembers the name of the identifier column.
     *
     * @throws IllegalStateException if the sheet has no row at {@code header_row}
     */
    private void getHeader() {
        final Row row = sheet.getRow(header_row);
        if (row == null) {
            throw new IllegalStateException(
                    "Header row " + header_row + " not found in sheet " + sheet_number);
        }
        // Collected as a list (not joined and re-split on ';') so that names containing ';'
        // and empty trailing names are preserved.
        final List<String> names = new ArrayList<>();
        int count = 0;
        for (Cell cell : row) {
            final String name = applyReplace(cell.getStringCellValue(), map_header);
            names.add(name);
            if (count++ == column_id) {
                identifier = name;
            }
        }
        csv_writer.setHeader(names.toArray(new String[0]));
    }

    /**
     * Reads every row below the header, cleans its values and adds it to the CSV writer unless
     * the row is discarded because of its identifier.
     */
    private void getData() {
        final boolean filterById = StringUtils.isNotEmpty(remove_id);
        // getLastRowNum() is the 0-based index of the last row, hence the inclusive bound.
        final int lastRow = sheet.getLastRowNum();
        for (int rowNumber = header_row + 1; rowNumber <= lastRow; rowNumber++) {
            final Row row = sheet.getRow(rowNumber);
            if (row == null) {
                continue;
            }
            final ArrayList<String> values = new ArrayList<>();
            boolean discardRow = false;
            int colNumber = 0;
            // NOTE(review): cellIterator() visits only the cells that are defined in the row, so
            // colNumber is a position among those cells; confirm the input never has gaps.
            final Iterator<Cell> cells = row.cellIterator();
            while (cells.hasNext() && !discardRow) {
                String value = getCellValue(cells.next()).trim().replace("\n", " ");
                if (colNumber == column_id
                        && ((remove_empty && value.isEmpty()) || (filterById && value.equals(remove_id)))) {
                    discardRow = true;
                }
                if (replace_currency && colNumber == currency_column) {
                    value = value.replace(from_currency, to_currency);
                }
                values.add(applyReplace(value, map_body));
                colNumber++;
            }
            if (!discardRow) {
                csv_writer.addProject(values);
            }
        }
    }

    /** Writes the collected header and rows to {@code file_to_save}. */
    private void writeCSVFile() {
        csv_writer.writeFile(file_to_save);
    }

    /** Builds the descriptor used by the CSV collector to read the generated file. */
    private InterfaceDescriptor prepareHTTPCSVDescriptor() {
        final InterfaceDescriptor dex = new InterfaceDescriptor();
        dex.setBaseUrl("file://" + file_to_save);
        final HashMap<String, String> params = new HashMap<>();
        params.put("separator", descriptor.getParams().get("separator"));
        params.put("identifier", identifier);
        params.put("quote", descriptor.getParams().get("quote"));
        dex.setParams(params);
        return dex;
    }

    /**
     * Downloads the workbook, converts the configured sheet to CSV and returns what the CSV
     * collector produces for the generated file.
     *
     * @return the result of {@link HttpCSVCollectorPlugin#collect} on the generated CSV
     * @throws IllegalStateException if no collector has been set
     * @throws Exception if the download, the conversion or the collection fails
     */
    public Iterable<String> parseFile() throws Exception {
        // Fail before downloading anything rather than with a NullPointerException at the end.
        if (collector == null) {
            throw new IllegalStateException("The CSV collector has not been set");
        }
        init();
        log.info("Getting header elements");
        getHeader();
        log.info("Getting sheet data");
        getData();
        log.info("Writing the csv file");
        writeCSVFile();
        log.info("Preparing to parse csv");
        return collector.collect(prepareHTTPCSVDescriptor(), "", "");
    }
}