package eu.dnetlib.data.collector.plugins.ariadneplus.ads;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.Iterator;

import com.google.common.base.Strings;
import com.google.common.collect.Iterators;
import eu.dnetlib.data.collector.plugins.FileCollectorPlugin;
import eu.dnetlib.data.collector.plugins.filesystem.FileSystemIterator;
import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector;
import eu.dnetlib.miscutils.iterators.xml.XMLIterator;
import eu.dnetlib.rmi.data.CollectorServiceException;
import eu.dnetlib.rmi.data.InterfaceDescriptor;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.jena.atlas.iterator.Iter;
import org.springframework.beans.factory.annotation.Autowired;

/**
 * Collector plugin that produces ADS records either from every {@code xml} file found under a
 * local folder, or from one remote XML document fetched through the {@link HttpConnector}.
 *
 * <p>The mode is selected by the {@code singleRemoteFile} interface parameter: when it parses to
 * {@code true} the base URL is fetched as a single document and split on the element named by the
 * {@code splitOnElement} parameter; otherwise the base URL is treated as a local folder.
 */
public class ADSCollectorPlugin extends FileCollectorPlugin {

	/** The Constant log. */
	private static final Log log = LogFactory.getLog(ADSCollectorPlugin.class);

	/** Iterator built by the most recent local-folder collection; exposed through the accessors below. */
	private Iterator recordIterator;

	@Autowired
	private HttpConnector httpConnector;

	/**
	 * Collects the records described by the given interface.
	 *
	 * @param interfaceDescriptor descriptor providing the base URL and the {@code singleRemoteFile} /
	 *        {@code splitOnElement} parameters
	 * @param fromDate forwarded unchanged to the superclass when collecting from a local folder
	 * @param untilDate forwarded unchanged to the superclass when collecting from a local folder
	 * @return an iterable over the collected records
	 * @throws CollectorServiceException if the base URL is null or empty, is malformed, points to a
	 *         path that does not exist, or (remote mode) {@code splitOnElement} is null or empty
	 */
	@Override
	public Iterable collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
			throws CollectorServiceException {
		final String baseUrl = interfaceDescriptor.getBaseUrl();
		if (Strings.isNullOrEmpty(baseUrl)) {
			throw new CollectorServiceException("Param 'baseurl' is null or empty");
		}

		final String srf = interfaceDescriptor.getParams().get("singleRemoteFile");
		// parseBoolean yields false for null, same as the boxed Boolean.valueOf, without the boxing.
		final boolean singleRemoteFile = Boolean.parseBoolean(srf);
		log.debug("bool singleRemoteFile? " + singleRemoteFile);

		if (singleRemoteFile) {
			return collectFromSingleRemoteFile(baseUrl, interfaceDescriptor.getParams().get("splitOnElement"));
		}
		return collectFromLocalFolder(interfaceDescriptor, baseUrl, fromDate, untilDate);
	}

	/**
	 * Concatenates the records of every {@code xml} file found under the folder named by {@code baseUrl}.
	 *
	 * <p>Files whose collection fails are logged and skipped. If no file yields an iterator, an
	 * empty iterator is returned.
	 */
	private Iterable collectFromLocalFolder(final InterfaceDescriptor interfaceDescriptor, final String baseUrl,
			final String fromDate, final String untilDate) throws CollectorServiceException {
		final String url = baseUrl.startsWith("file://") ? baseUrl : "file://".concat(baseUrl);

		final URL basePath;
		try {
			basePath = new URL(url);
		} catch (MalformedURLException mue) {
			log.error("Failed collecting from base url " + url, mue);
			throw new CollectorServiceException(mue);
		}

		final File baseDir = new File(basePath.getPath());
		if (!baseDir.exists()) {
			throw new CollectorServiceException(String.format("The base URL %s, does not exist", basePath.getPath()));
		}
		log.debug("Start collecting from folder " + baseDir + " ...");

		final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), "xml");

		// Build the result locally so that a run finding no usable file can never hand back
		// whatever a previous invocation left in the recordIterator field.
		Iterator collected = null;
		while (fsi.hasNext()) {
			final String nextFilePath = fsi.next();
			// The superclass reads the file location from the descriptor, so point it at the current file.
			// NOTE(review): the descriptor is left pointing at the last visited file when this method returns.
			interfaceDescriptor.setBaseUrl("file://".concat(nextFilePath));
			try {
				log.debug("Add iterator from " + nextFilePath);
				final Iterator fileRecords = new ADSIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator());
				collected = (collected == null) ? fileRecords : Iterators.concat(collected, fileRecords);
			} catch (CollectorServiceException e) {
				// Best effort: one unreadable file must not abort the whole collection.
				log.error("Failed collecting from path: " + nextFilePath, e);
			}
		}

		if (collected == null) {
			// No xml file found, or every file failed.
			collected = Collections.emptyIterator();
		}
		recordIterator = collected;
		return new ADSIterable(collected);
	}

	/**
	 * Fetches a single remote XML document and splits it into records.
	 *
	 * @param baseUrl location handed to the {@link HttpConnector}
	 * @param splitOnElement name of the element passed to the {@link XMLIterator} as the split point
	 * @return an iterable over the records extracted from the fetched document
	 * @throws CollectorServiceException if {@code splitOnElement} is null or empty, or if the
	 *         connector fails
	 */
	public Iterable collectFromSingleRemoteFile(final String baseUrl, final String splitOnElement) throws CollectorServiceException {
		if (Strings.isNullOrEmpty(splitOnElement)) {
			throw new CollectorServiceException("Param 'splitOnElement' is null or empty");
		}
		final String xml = httpConnector.getInputSource(baseUrl);
		final BufferedInputStream bis = new BufferedInputStream(IOUtils.toInputStream(xml, StandardCharsets.UTF_8));
		return new ADSIterable(new ADSIterator(new XMLIterator(splitOnElement, bis)));
	}

	/** @return the iterator stored by the last local-folder collection, or set through the setter */
	public Iterator getRecordIterator() {
		return recordIterator;
	}

	/** @param recordIterator the iterator to store */
	public void setRecordIterator(Iterator recordIterator) {
		this.recordIterator = recordIterator;
	}

	/** @return the connector used to fetch remote documents */
	public HttpConnector getHttpConnector() {
		return httpConnector;
	}

	/** @param httpConnector the connector used to fetch remote documents */
	public void setHttpConnector(HttpConnector httpConnector) {
		this.httpConnector = httpConnector;
	}
}