dnet-core/dnet-data-services/src/main/java/eu/dnetlib/data/collector/plugins/filesystem/FilesystemIterable.java

140 lines
4.6 KiB
Java

package eu.dnetlib.data.collector.plugins.filesystem;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.ximpleware.*;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONObject;
import org.json.XML;
/**
 * Iterable over the files found under a base directory, exposing each file as a String record.
 * <p>
 * The base directory is taken from the path component of the interface descriptor's base URL.
 * Files are enumerated by a {@code FileSystemIterator} built with the configured extensions.
 * Each file is read as UTF-8 and returned either as (cleaned) XML or, when the file format is
 * {@code json}, converted to XML and wrapped in a {@code record} element.
 * <p>
 * Files that cannot be read or processed do not interrupt the iteration: the error is logged
 * and an empty string is returned for that file.
 *
 * @author Sandro, Michele, Andrea
 */
public class FilesystemIterable implements Iterable<String> {

    /**
     * The Constant log.
     */
    private static final Log log = LogFactory.getLog(FilesystemIterable.class);

    /**
     * Charset used both to decode the input files and to (re-)encode XML for VTD.
     */
    private static final String UTF_8 = "UTF-8";

    /**
     * Accepted values for the "fileFormat" parameter (matched case-sensitively).
     */
    private static final List<String> SUPPORTED_FORMATS = Lists.newArrayList("xml", "json");

    /**
     * The base dir.
     */
    private File baseDir;

    /**
     * The extensions, as read from the "extensions" parameter and handed to the FileSystemIterator.
     */
    private String extensions;

    /**
     * File format (json / xml). Defaults to xml when the "fileFormat" parameter is absent.
     **/
    private String fileFormat = "xml";

    /**
     * When true, the base name of each file is emitted as header/objIdentifier of the record.
     */
    private boolean setObjIdentifierFromFileName = false;

    /**
     * Instantiates a new filesystem iterable.
     * <p>
     * Reads the following parameters from the descriptor: "extensions", "fileFormat" (optional,
     * must be one of the supported formats) and "setObjIdentifierFromFileName" (optional, parsed
     * with {@link Boolean#parseBoolean(String)}).
     *
     * @param descriptor the descriptor providing the base URL and the parameters
     * @throws CollectorServiceException if the base URL is malformed, the directory it points to does not exist,
     *                                   or the requested file format is not supported
     */
    public FilesystemIterable(final InterfaceDescriptor descriptor) throws CollectorServiceException {
        try {
            final String baseUrl = descriptor.getBaseUrl();
            final URL basePath = new URL(baseUrl);
            this.baseDir = new File(basePath.getPath());
            if (!baseDir.exists()) {
                throw new CollectorServiceException(String.format("The base URL %s, does not exist", basePath.getPath()));
            }
            this.extensions = descriptor.getParams().get("extensions");
            if (descriptor.getParams().containsKey("fileFormat")) {
                this.fileFormat = descriptor.getParams().get("fileFormat");
            }
            if (!SUPPORTED_FORMATS.contains(fileFormat)) {
                throw new CollectorServiceException(
                        "File format " + fileFormat + " not supported. Supported formats are: " + StringUtils.join(SUPPORTED_FORMATS, ','));
            }
            if (descriptor.getParams().containsKey("setObjIdentifierFromFileName")) {
                this.setObjIdentifierFromFileName = Boolean.parseBoolean(descriptor.getParams().get("setObjIdentifierFromFileName"));
            }
        } catch (MalformedURLException e) {
            throw new CollectorServiceException("Filesystem collector failed! ", e);
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * Each call creates a new FileSystemIterator on the base directory; file contents are read
     * when the corresponding element is requested from the returned iterator.
     *
     * @see java.lang.Iterable#iterator()
     */
    @Override
    public Iterator<String> iterator() {
        final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), extensions);
        return Iterators.transform(fsi, this::toRecord);
    }

    /**
     * Reads one file and turns its content into the record string returned by the iterator.
     *
     * @param inputFileName path of the file to read
     * @return the record, or an empty string if the file could not be read or processed (the failure is logged)
     */
    private String toRecord(final String inputFileName) {
        try {
            final String content = readFile(inputFileName);
            if ("json".equalsIgnoreCase(fileFormat)) {
                final JSONObject metadata = new JSONObject(content);
                final JSONObject record = new JSONObject();
                if (setObjIdentifierFromFileName) {
                    record.put("header", new JSONObject().put("objIdentifier", FilenameUtils.getBaseName(inputFileName)));
                }
                record.put("metadata", metadata);
                // Serialising the whole document is only worth it when debug logging is on.
                if (log.isDebugEnabled()) {
                    log.debug(record.toString());
                }
                return XML.toString(record, "record");
            }
            // Drop a leading byte order mark (U+FEFF) before handing the text to the cleaner.
            final String withoutBom = content.startsWith("\uFEFF") ? content.substring(1) : content;
            final String cleanedXML = XmlCleaner.cleanAllEntities(withoutBom);
            if (setObjIdentifierFromFileName) {
                return addObjIdentifier(cleanedXML, FilenameUtils.getBaseName(inputFileName));
            }
            return cleanedXML;
        } catch (VTDException e) {
            // Keep iterating over the remaining files, but do not lose the cause.
            log.error("Cannot process with VTD to set the objIdentifier " + inputFileName, e);
            return "";
        } catch (Exception e) {
            log.error("Unable to read " + inputFileName, e);
            return "";
        }
    }

    /**
     * Reads the whole file as UTF-8 text.
     * <p>
     * UTF-8 is used explicitly (instead of the platform default charset) so that the result does not
     * depend on the JVM configuration and is consistent with the UTF-8 encoding used by
     * {@link #addObjIdentifier(String, String)}.
     *
     * @param inputFileName path of the file to read
     * @return the file content
     * @throws IOException if the file cannot be opened, read or closed
     */
    private String readFile(final String inputFileName) throws IOException {
        try (InputStream in = new FileInputStream(inputFileName)) {
            return IOUtils.toString(in, UTF_8);
        }
    }

    /**
     * Wraps the root element of the given XML in {@code <record><header><objIdentifier>..</objIdentifier></header><metadata>..</metadata></record>}.
     * <p>
     * If VTD cannot navigate to the root element, the XML is returned without the wrapper.
     *
     * @param xml           the XML document to wrap
     * @param objidentifier the identifier to put in the header; XML special characters are escaped
     * @return the resulting XML, decoded as UTF-8
     * @throws VTDException if the XML cannot be parsed or modified by VTD
     * @throws IOException  if the modified document cannot be written out
     */
    private String addObjIdentifier(String xml, String objidentifier) throws VTDException, IOException {
        VTDGen vg = new VTDGen(); // Instantiate VTDGen
        XMLModifier xm = new XMLModifier(); //Instantiate XMLModifier
        vg.setDoc(xml.getBytes(UTF_8));
        vg.parse(false);
        VTDNav vn = vg.getNav();
        xm.bind(vn);
        if (vn.toElement(VTDNav.ROOT)) {
            // The identifier comes from a file name, which may legally contain '&', '<' or '>'.
            xm.insertBeforeElement("<record><header><objIdentifier>" + escapeXml(objidentifier) + "</objIdentifier></header><metadata>");
            xm.insertAfterElement("</metadata></record>");
        }
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        xm.output(baos);
        return baos.toString(UTF_8);
    }

    /**
     * Escapes the characters that are not allowed verbatim in XML element content.
     *
     * @param text the text to escape
     * @return the text with {@code &}, {@code <} and {@code >} replaced by the corresponding entities
     */
    private static String escapeXml(final String text) {
        final StringBuilder escaped = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            final char c = text.charAt(i);
            switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            default:
                escaped.append(c);
            }
        }
        return escaped.toString();
    }
}