You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
106 lines
4.8 KiB
Java
106 lines
4.8 KiB
Java
package eu.dnetlib.data.collector.plugins.ariadneplus;
|
|
|
|
import com.google.common.collect.Iterators;
import eu.dnetlib.data.collector.plugins.FileCollectorPlugin;
import eu.dnetlib.data.collector.plugins.filesystem.FileSystemIterator;
import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector;
import eu.dnetlib.miscutils.iterators.xml.XMLIterator;
import eu.dnetlib.rmi.data.CollectorServiceException;
import eu.dnetlib.rmi.data.InterfaceDescriptor;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;

import java.io.BufferedInputStream;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
|
|
|
|
public class XMLsFolderCollectorPlugin extends FileCollectorPlugin {
|
|
|
|
private Iterator<String> recordIterator;
|
|
@Autowired
|
|
private HttpConnector httpConnector;
|
|
|
|
/** The Constant log. */
|
|
private static final Log log = LogFactory.getLog(eu.dnetlib.data.collector.plugins.ariadneplus.XMLsFolderCollectorPlugin.class);
|
|
|
|
@Override
|
|
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
|
throws CollectorServiceException {
|
|
|
|
final String baseUrl = interfaceDescriptor.getBaseUrl();
|
|
if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
|
|
String srf = interfaceDescriptor.getParams().get("singleRemoteFile");
|
|
boolean singleRemoteFile = Boolean.valueOf(srf);
|
|
log.debug("bool singleRemoteFile? "+singleRemoteFile);
|
|
final String namespaceList = interfaceDescriptor.getParams().get("namespaceList");
|
|
if (namespaceList == null || namespaceList.isEmpty()) { throw new CollectorServiceException("Param 'namespaceList' is null or empty"); }
|
|
if(!singleRemoteFile) {
|
|
String url = (baseUrl.startsWith("file://")) ? baseUrl : "file://".concat(baseUrl);
|
|
URL basePath;
|
|
try {
|
|
basePath = new URL(url);
|
|
} catch (MalformedURLException mue) {
|
|
log.error("Failed collecting from base url " + url, mue);
|
|
throw new CollectorServiceException(mue);
|
|
}
|
|
|
|
File baseDir = new File(basePath.getPath());
|
|
if (!baseDir.exists()) {
|
|
throw new CollectorServiceException(String.format("The base URL %s, does not exist", basePath.getPath()));
|
|
}
|
|
|
|
log.debug("Start collecting from folder " + baseDir + " ...");
|
|
final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), "xml");
|
|
|
|
boolean emptyIterator = true;
|
|
while (fsi.hasNext()) {
|
|
String nextFilePath = fsi.next();
|
|
interfaceDescriptor.setBaseUrl("file://".concat(nextFilePath));
|
|
try {
|
|
log.debug("Add iterator from " + nextFilePath);
|
|
if (emptyIterator) {
|
|
recordIterator = new XMLsFolderIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), namespaceList);
|
|
emptyIterator = false;
|
|
} else {
|
|
recordIterator = Iterators.concat(recordIterator, new XMLsFolderIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), namespaceList));
|
|
}
|
|
} catch (CollectorServiceException e) {
|
|
log.error("Failed collecting from path: " + nextFilePath, e);
|
|
}
|
|
}
|
|
return new XMLsFolderIterable(recordIterator);
|
|
}
|
|
else {
|
|
//singleRemoteFile
|
|
return collectFromSingleRemoteFile(baseUrl, interfaceDescriptor.getParams().get("splitOnElement"), namespaceList);
|
|
}
|
|
}
|
|
|
|
public Iterable<String> collectFromSingleRemoteFile(final String baseUrl, final String splitOnElement, final String namespaceList) throws CollectorServiceException {
|
|
final String xml = httpConnector.getInputSource(baseUrl);
|
|
BufferedInputStream bis = new BufferedInputStream(IOUtils.toInputStream(xml, Charset.forName("utf-8")));
|
|
return new XMLsFolderIterable(new XMLsFolderIterator(new XMLIterator(splitOnElement, bis), namespaceList));
|
|
}
|
|
|
|
public Iterator<String> getRecordIterator() {
|
|
return recordIterator;
|
|
}
|
|
|
|
public void setRecordIterator(Iterator<String> recordIterator) {
|
|
this.recordIterator = recordIterator;
|
|
}
|
|
|
|
public HttpConnector getHttpConnector() {
|
|
return httpConnector;
|
|
}
|
|
|
|
public void setHttpConnector(HttpConnector httpConnector) {
|
|
this.httpConnector = httpConnector;
|
|
}
|
|
}
|