diff --git a/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/ads/ADSCollectorPlugin.java b/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/ads/ADSCollectorPlugin.java index 5037d31..c90a052 100644 --- a/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/ads/ADSCollectorPlugin.java +++ b/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/ads/ADSCollectorPlugin.java @@ -1,10 +1,17 @@ package eu.dnetlib.data.collector.plugins.ariadneplus.ads; +import java.io.BufferedInputStream; import java.io.File; +import java.io.FileInputStream; import java.net.MalformedURLException; import java.net.URL; +import java.nio.charset.Charset; import java.util.Iterator; +import com.google.common.base.Strings; +import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector; +import eu.dnetlib.miscutils.iterators.xml.XMLIterator; +import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -14,12 +21,15 @@ import eu.dnetlib.data.collector.plugins.FileCollectorPlugin; import eu.dnetlib.data.collector.plugins.filesystem.FileSystemIterator; import eu.dnetlib.rmi.data.CollectorServiceException; import eu.dnetlib.rmi.data.InterfaceDescriptor; +import org.apache.jena.atlas.iterator.Iter; +import org.springframework.beans.factory.annotation.Autowired; public class ADSCollectorPlugin extends FileCollectorPlugin { private Iterator recordIterator; - private URL basePath; + @Autowired + private HttpConnector httpConnector; /** The Constant log. */ private static final Log log = LogFactory.getLog(ADSCollectorPlugin.class); @@ -30,38 +40,70 @@ public class ADSCollectorPlugin extends FileCollectorPlugin { final String baseUrl = interfaceDescriptor.getBaseUrl(); if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); } - String url = "file://".concat(baseUrl); - try { - basePath = new URL(url); - } catch (MalformedURLException mue) { - log.error("Failed collecting from base url " + url, mue); - throw new CollectorServiceException(mue); - } - - File baseDir = new File(basePath.getPath()); - if (!baseDir.exists()) { throw new CollectorServiceException(String.format("The base ULR %s, does not exist", basePath.getPath())); } - - log.debug("Start collecting from folder " + baseDir + " ..."); - final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), "xml"); - - boolean emptyIterator = true; - while (fsi.hasNext()) { - String nextFilePath = fsi.next(); - interfaceDescriptor.setBaseUrl("file://".concat(nextFilePath)); + String srf = interfaceDescriptor.getParams().get("singleRemoteFile"); + boolean singleRemoteFile = Boolean.valueOf(srf); + log.debug("bool singleRemoteFile? "+singleRemoteFile); + if(!singleRemoteFile) { + String url = (baseUrl.startsWith("file://")) ? baseUrl : "file://".concat(baseUrl); + URL basePath; try { - log.debug("Add iterator from " + nextFilePath); - if (emptyIterator) { - recordIterator = new ADSIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), null, null); - emptyIterator = false; - } - else { - recordIterator = Iterators.concat(recordIterator, new ADSIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), null, null)); - } - } catch (CollectorServiceException e) { - log.error("Failed collecting from path: " + nextFilePath, e); + basePath = new URL(url); + } catch (MalformedURLException mue) { + log.error("Failed collecting from base url " + url, mue); + throw new CollectorServiceException(mue); } + + File baseDir = new File(basePath.getPath()); + if (!baseDir.exists()) { + throw new CollectorServiceException(String.format("The base URL %s, does not exist", basePath.getPath())); + } + + log.debug("Start collecting from folder " + baseDir + " ..."); + final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), "xml"); + + boolean emptyIterator = true; + while (fsi.hasNext()) { + String nextFilePath = fsi.next(); + interfaceDescriptor.setBaseUrl("file://".concat(nextFilePath)); + try { + log.debug("Add iterator from " + nextFilePath); + if (emptyIterator) { + recordIterator = new ADSIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator()); + emptyIterator = false; + } else { + recordIterator = Iterators.concat(recordIterator, new ADSIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator())); + } + } catch (CollectorServiceException e) { + log.error("Failed collecting from path: " + nextFilePath, e); + } + } + return new ADSIterable(recordIterator); + } + else { + //singleRemoteFile + return collectFromSingleRemoteFile(baseUrl, interfaceDescriptor.getParams().get("splitOnElement")); } - return new ADSIterable(recordIterator); } - + + public Iterable collectFromSingleRemoteFile(final String baseUrl, final String splitOnElement) throws CollectorServiceException { + final String xml = httpConnector.getInputSource(baseUrl); + BufferedInputStream bis = new BufferedInputStream(IOUtils.toInputStream(xml, Charset.forName("utf-8"))); + return new ADSIterable(new ADSIterator(new XMLIterator(splitOnElement, bis))); + } + + public Iterator getRecordIterator() { + return recordIterator; + } + + public void setRecordIterator(Iterator recordIterator) { + this.recordIterator = recordIterator; + } + + public HttpConnector getHttpConnector() { + return httpConnector; + } + + public void setHttpConnector(HttpConnector httpConnector) { + this.httpConnector = httpConnector; + } } diff --git a/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/ads/ADSIterator.java b/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/ads/ADSIterator.java index 699d136..aa6f533 100644 --- a/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/ads/ADSIterator.java +++ b/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/ads/ADSIterator.java @@ -4,6 +4,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Iterator; +import com.google.common.collect.Lists; import com.ximpleware.*; import eu.dnetlib.data.collector.ThreadSafeIterator; import eu.dnetlib.rmi.data.CollectorServiceRuntimeException; @@ -13,27 +14,26 @@ import org.apache.commons.logging.LogFactory; public class ADSIterator extends ThreadSafeIterator { private static final Log log = LogFactory.getLog(ADSIterator.class); - private Iterator identifiers; - private String baseUrl; - private String suffix; + + private Iterator iterator; - public ADSIterator(final Iterator idIterator, final String baseUrl, final String suffix){ - this.identifiers = idIterator; - this.baseUrl = baseUrl; - this.suffix = suffix; + public ADSIterator(final Iterator recordIterator){ + this.iterator = recordIterator; } @Override public boolean doHasNext() { - return identifiers.hasNext(); + return iterator.hasNext(); } @Override public String doNext() { - String record = identifiers.next(); + String record = iterator.next(); try { return addADSNamespace(record); } catch (Exception e) { + log.warn("Skipping record because of exception "+e); + log.debug("Skipped record: "+record); if(this.hasNext()){ return this.next(); } @@ -66,27 +66,12 @@ public class ADSIterator extends ThreadSafeIterator { } } - public Iterator getIdentifiers() { - return identifiers; + public Iterator getIterator() { + return iterator; } - public void setIdentifiers(final Iterator identifiers) { - this.identifiers = identifiers; + public void setIterator(final Iterator iterator) { + this.iterator = iterator; } - public String getBaseUrl() { - return baseUrl; - } - - public void setBaseUrl(final String baseUrl) { - this.baseUrl = baseUrl; - } - - public String getSuffix() { - return suffix; - } - - public void setSuffix(final String suffix) { - this.suffix = suffix; - } } diff --git a/dnet-ariadneplus/src/main/resources/eu/dnetlib/data/collector/plugins/ariadneplus/applicationContext-ariadneplus-collector-plugins.xml b/dnet-ariadneplus/src/main/resources/eu/dnetlib/data/collector/plugins/ariadneplus/applicationContext-ariadneplus-collector-plugins.xml index cd1d103..86c6034 100644 --- a/dnet-ariadneplus/src/main/resources/eu/dnetlib/data/collector/plugins/ariadneplus/applicationContext-ariadneplus-collector-plugins.xml +++ b/dnet-ariadneplus/src/main/resources/eu/dnetlib/data/collector/plugins/ariadneplus/applicationContext-ariadneplus-collector-plugins.xml @@ -12,6 +12,8 @@ +