package eu.dnetlib.data.collector.plugins; import java.io.*; import java.net.URL; import java.util.Iterator; import java.util.Set; import com.google.common.collect.Iterators; import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin; import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner; import eu.dnetlib.data.collector.rmi.CollectorServiceException; import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVParser; import org.apache.commons.io.input.BOMInputStream; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dom4j.Document; import org.dom4j.DocumentHelper; import org.dom4j.Element; /** * The Class HttpCSVCollectorPlugin. */ public class HttpCSVCollectorPlugin extends AbstractCollectorPlugin { private static final Log log = LogFactory.getLog(HttpCSVCollectorPlugin.class); public static final String UTF8_BOM = "\uFEFF"; /** * The Class HTTPCSVIterator. */ class HTTPCSVIterator implements Iterable { /** The descriptor. */ private InterfaceDescriptor descriptor; /** * Instantiates a new HTTPCSV iterator. * * @param descriptor * the descriptor */ public HTTPCSVIterator(final InterfaceDescriptor descriptor) { this.descriptor = descriptor; } /** * Iterator. * * @return the iterator */ @SuppressWarnings("resource") @Override public Iterator iterator() { try { final String separator = descriptor.getParams().get("separator"); final String identifier = descriptor.getParams().get("identifier"); final String quote = descriptor.getParams().get("quote"); final URL url = new URL(descriptor.getBaseUrl()); long nLines = 0; // FIX // This code should skip the lines with invalid quotes final File tempFile = File.createTempFile("csv-", ".tmp"); try (InputStream is = url.openConnection().getInputStream(); BOMInputStream bomIs = new BOMInputStream(is); BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs)); FileWriter fw = new FileWriter(tempFile)) { String line; while ((line = reader.readLine()) != null) { if (StringUtils.isBlank(quote) || (quote.charAt(0) != '"') || verifyQuotes(line, separator.charAt(0))) { fw.write(line); fw.write("\n"); nLines++; } } } // END FIX final CSVFormat format = CSVFormat.EXCEL .withHeader() .withDelimiter(separator.equals("\\t") || StringUtils.isBlank(separator) ? '\t' : separator.charAt(0)) .withQuote(StringUtils.isBlank(quote) ? null : quote.charAt(0)) .withTrim(); final CSVParser parser = new CSVParser(new FileReader(tempFile), format); final Set headers = parser.getHeaderMap().keySet(); final long nRecords = nLines - 1; return Iterators.transform(parser.iterator(), input -> { try { final Document document = DocumentHelper.createDocument(); final Element root = document.addElement("csvRecord"); for (final String key : headers) { final Element row = root.addElement("column"); row.addAttribute("name", key).addText(XmlCleaner.cleanAllEntities(input.get(key))); if (key.equals(identifier)) { row.addAttribute("isID", "true"); } } return document.asXML(); } finally { log.debug(tempFile.getAbsolutePath()); if (parser.getRecordNumber() == nRecords) { log.debug("DELETING " + tempFile.getAbsolutePath()); tempFile.delete(); } } }); } catch (final Exception e) { log.error("Error iterating csv lines", e); return null; } } } /* * (non-Javadoc) * * @see eu.dnetlib.data.collector.plugin.CollectorPlugin#collect(eu.dnetlib.data.collector.rmi.InterfaceDescriptor, java.lang.String, * java.lang.String) */ @Override public Iterable collect(final InterfaceDescriptor descriptor, final String fromDate, final String untilDate) throws CollectorServiceException { return new HTTPCSVIterator(descriptor); } public boolean verifyQuotes(final String line, final char separator) { final char[] cs = line.trim().toCharArray(); boolean inField = false; boolean skipNext = false; for (int i = 0; i < cs.length; i++) { if (skipNext) { skipNext = false; } else if (inField) { if ((cs[i] == '\"') && ((i == (cs.length - 1)) || (cs[i + 1] == separator))) { inField = false; } else if ((cs[i] == '\"') && (i < (cs.length - 1))) { if ((cs[i + 1] == '\"')) { skipNext = true; } else { log.warn("Skipped invalid line: " + line); return false; } } } else { if ((cs[i] == '\"') && ((i == 0) || (cs[i - 1] == separator))) { inField = true; } } } if (inField) { log.warn("Skipped invalid line: " + line); return false; } return true; } }