package eu.dnetlib.data.collector.plugins.ariadneplus; import com.google.common.collect.Iterators; import eu.dnetlib.data.collector.plugins.FileCollectorPlugin; import eu.dnetlib.data.collector.plugins.filesystem.FileSystemIterator; import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector; import eu.dnetlib.miscutils.iterators.xml.XMLIterator; import eu.dnetlib.rmi.data.CollectorServiceException; import eu.dnetlib.rmi.data.InterfaceDescriptor; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.springframework.beans.factory.annotation.Autowired; import java.io.BufferedInputStream; import java.io.File; import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.Charset; import java.util.Iterator; public class XMLsFolderCollectorPlugin extends FileCollectorPlugin { private Iterator recordIterator; @Autowired private HttpConnector httpConnector; /** The Constant log. */ private static final Log log = LogFactory.getLog(eu.dnetlib.data.collector.plugins.ariadneplus.XMLsFolderCollectorPlugin.class); @Override public Iterable collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) throws CollectorServiceException { final String baseUrl = interfaceDescriptor.getBaseUrl(); if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); } String srf = interfaceDescriptor.getParams().get("singleRemoteFile"); boolean singleRemoteFile = Boolean.valueOf(srf); log.debug("bool singleRemoteFile? "+singleRemoteFile); final String namespaceList = interfaceDescriptor.getParams().get("namespaceList"); if (namespaceList == null || namespaceList.isEmpty()) { throw new CollectorServiceException("Param 'namespaceList' is null or empty"); } if(!singleRemoteFile) { String url = (baseUrl.startsWith("file://")) ? baseUrl : "file://".concat(baseUrl); URL basePath; try { basePath = new URL(url); } catch (MalformedURLException mue) { log.error("Failed collecting from base url " + url, mue); throw new CollectorServiceException(mue); } File baseDir = new File(basePath.getPath()); if (!baseDir.exists()) { throw new CollectorServiceException(String.format("The base URL %s, does not exist", basePath.getPath())); } log.debug("Start collecting from folder " + baseDir + " ..."); final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), "xml"); boolean emptyIterator = true; while (fsi.hasNext()) { String nextFilePath = fsi.next(); interfaceDescriptor.setBaseUrl("file://".concat(nextFilePath)); try { log.debug("Add iterator from " + nextFilePath); if (emptyIterator) { recordIterator = new XMLsFolderIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), namespaceList); emptyIterator = false; } else { recordIterator = Iterators.concat(recordIterator, new XMLsFolderIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), namespaceList)); } } catch (CollectorServiceException e) { log.error("Failed collecting from path: " + nextFilePath, e); } } return new XMLsFolderIterable(recordIterator); } else { //singleRemoteFile return collectFromSingleRemoteFile(baseUrl, interfaceDescriptor.getParams().get("splitOnElement"), namespaceList); } } public Iterable collectFromSingleRemoteFile(final String baseUrl, final String splitOnElement, final String namespaceList) throws CollectorServiceException { final String xml = httpConnector.getInputSource(baseUrl); BufferedInputStream bis = new BufferedInputStream(IOUtils.toInputStream(xml, Charset.forName("utf-8"))); return new XMLsFolderIterable(new XMLsFolderIterator(new XMLIterator(splitOnElement, bis), namespaceList)); } public Iterator getRecordIterator() { return recordIterator; } public void setRecordIterator(Iterator recordIterator) { this.recordIterator = recordIterator; } public HttpConnector getHttpConnector() { return httpConnector; } public void setHttpConnector(HttpConnector httpConnector) { this.httpConnector = httpConnector; } }