package eu.dnetlib.data.collector.plugins.filesystem; import java.io.*; import java.net.MalformedURLException; import java.net.URL; import java.util.Iterator; import java.util.List; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; import com.ximpleware.*; import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner; import eu.dnetlib.data.collector.rmi.CollectorServiceException; import eu.dnetlib.data.collector.rmi.InterfaceDescriptor; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.json.JSONObject; import org.json.XML; /** * The Class FilesystemIterable. * * @author Sandro, Michele, Andrea */ public class FilesystemIterable implements Iterable { /** * The Constant log. */ private static final Log log = LogFactory.getLog(FilesystemIterable.class); /** * The base dir. */ private File baseDir; /** * The extensions. */ private String extensions; /** * File format (json / xml) **/ private String fileFormat = "xml"; private List supportedFormats = Lists.newArrayList("xml", "json"); private boolean setObjIdentifierFromFileName = false; /** * Instantiates a new filesystem iterable. * * @param descriptor the descriptor * @throws CollectorServiceException the collector service exception */ public FilesystemIterable(final InterfaceDescriptor descriptor) throws CollectorServiceException { try { final String baseUrl = descriptor.getBaseUrl(); URL basePath = new URL(baseUrl); this.baseDir = new File(basePath.getPath()); if (!baseDir.exists()) { throw new CollectorServiceException(String.format("The base ULR %s, does not exist", basePath.getPath())); } this.extensions = descriptor.getParams().get("extensions"); if (descriptor.getParams().containsKey("fileFormat")) fileFormat = descriptor.getParams().get("fileFormat"); if (!supportedFormats.contains(fileFormat)) throw new CollectorServiceException("File format " + fileFormat + " not supported. Supported formats are: " + StringUtils .join(supportedFormats, ',')); if (descriptor.getParams().containsKey("setObjIdentifierFromFileName")) { setObjIdentifierFromFileName = Boolean.parseBoolean(descriptor.getParams().get("setObjIdentifierFromFileName")); } } catch (MalformedURLException e) { throw new CollectorServiceException("Filesystem collector failed! ", e); } } /** * {@inheritDoc} * * @see java.lang.Iterable#iterator() */ @Override public Iterator iterator() { final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), extensions); return Iterators.transform(fsi, inputFileName -> { FileInputStream fileInputStream = null; try { fileInputStream = new FileInputStream(inputFileName); final String s = IOUtils.toString(fileInputStream); if (fileFormat.equalsIgnoreCase("json")) { JSONObject json = new JSONObject(s); JSONObject obj = new JSONObject(); if (setObjIdentifierFromFileName) { obj.put("header", new JSONObject().put("objIdentifier", FilenameUtils.getBaseName(inputFileName))); } obj.put("metadata", json); log.debug(obj.toString()); return XML.toString(obj, "record"); } String cleanedXML = XmlCleaner.cleanAllEntities(s.startsWith("\uFEFF") ? s.substring(1) : s); if (setObjIdentifierFromFileName) { return addObjIdentifier(cleanedXML, FilenameUtils.getBaseName(inputFileName)); } else return cleanedXML; } catch (VTDException e) { log.error("Cannot process with VTD to set the objIdentifier " + inputFileName); return ""; } catch (Exception e) { log.error("Unable to read " + inputFileName); return ""; } finally { if (fileInputStream != null) { try { fileInputStream.close(); } catch (IOException e) { log.error("Unable to close inputstream for " + inputFileName); } } } }); } private String addObjIdentifier(String xml, String objidentifier) throws VTDException, IOException { VTDGen vg = new VTDGen(); // Instantiate VTDGen XMLModifier xm = new XMLModifier(); //Instantiate XMLModifier vg.setDoc(xml.getBytes("UTF-8")); vg.parse(false); VTDNav vn = vg.getNav(); xm.bind(vn); if (vn.toElement(VTDNav.ROOT)) { xm.insertBeforeElement("
" + objidentifier + "
"); xm.insertAfterElement("
"); } ByteArrayOutputStream baos = new ByteArrayOutputStream(); xm.output(baos); return baos.toString("UTF-8"); } }