diff --git a/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/XMLsFolderCollectorPlugin.java b/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/XMLsFolderCollectorPlugin.java
new file mode 100644
index 0000000..432fcaa
--- /dev/null
+++ b/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/XMLsFolderCollectorPlugin.java
@@ -0,0 +1,147 @@
+package eu.dnetlib.data.collector.plugins.ariadneplus;
+
+import com.google.common.collect.Iterators;
+import eu.dnetlib.data.collector.plugins.FileCollectorPlugin;
+import eu.dnetlib.data.collector.plugins.filesystem.FileSystemIterator;
+import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector;
+import eu.dnetlib.miscutils.iterators.xml.XMLIterator;
+import eu.dnetlib.rmi.data.CollectorServiceException;
+import eu.dnetlib.rmi.data.InterfaceDescriptor;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Collector plugin that gathers XML records either from the files found under a local folder or from
+ * a single remote file (param {@code singleRemoteFile}), wrapping every source in an
+ * {@link XMLsFolderIterator} so that the configured namespace declarations are added to each record.
+ */
+public class XMLsFolderCollectorPlugin extends FileCollectorPlugin {
+
+    /** The Constant log. */
+    private static final Log log = LogFactory.getLog(XMLsFolderCollectorPlugin.class);
+
+    /** Iterator built by the most recent folder collection; exposed through the accessors below. */
+    private Iterator<String> recordIterator;
+
+    @Autowired
+    private HttpConnector httpConnector;
+
+    /**
+     * Collects records from the location described by the interface descriptor.
+     *
+     * @param interfaceDescriptor supplies the base url and the params {@code namespaceList} (required),
+     *                            {@code singleRemoteFile} and {@code splitOnElement}
+     * @param fromDate            passed through unchanged to the parent plugin for each file
+     * @param untilDate           passed through unchanged to the parent plugin for each file
+     * @return the collected records, each processed by an {@link XMLsFolderIterator}
+     * @throws CollectorServiceException if a required parameter is missing or the base location is unusable
+     */
+    @Override
+    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
+            throws CollectorServiceException {
+
+        final String baseUrl = interfaceDescriptor.getBaseUrl();
+        if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
+
+        final boolean singleRemoteFile = Boolean.parseBoolean(interfaceDescriptor.getParams().get("singleRemoteFile"));
+        log.debug("bool singleRemoteFile? " + singleRemoteFile);
+
+        final String namespaceList = interfaceDescriptor.getParams().get("namespaceList");
+        if (namespaceList == null || namespaceList.isEmpty()) { throw new CollectorServiceException("Param 'namespaceList' is null or empty"); }
+
+        if (singleRemoteFile) {
+            return collectFromSingleRemoteFile(baseUrl, interfaceDescriptor.getParams().get("splitOnElement"), namespaceList);
+        }
+        return collectFromFolder(interfaceDescriptor, baseUrl, fromDate, untilDate, namespaceList);
+    }
+
+    /** Builds one {@link XMLsFolderIterator} per file listed under the base folder and chains them. */
+    private Iterable<String> collectFromFolder(final InterfaceDescriptor interfaceDescriptor, final String baseUrl,
+            final String fromDate, final String untilDate, final String namespaceList) throws CollectorServiceException {
+
+        final String url = baseUrl.startsWith("file://") ? baseUrl : "file://".concat(baseUrl);
+        final URL basePath;
+        try {
+            basePath = new URL(url);
+        } catch (MalformedURLException mue) {
+            log.error("Failed collecting from base url " + url, mue);
+            throw new CollectorServiceException(mue);
+        }
+
+        final File baseDir = new File(basePath.getPath());
+        if (!baseDir.exists()) {
+            throw new CollectorServiceException(String.format("The base URL %s, does not exist", basePath.getPath()));
+        }
+
+        log.debug("Start collecting from folder " + baseDir + " ...");
+        final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), "xml");
+
+        // Gather the per-file iterators in a list and concatenate them once: chaining them pairwise nests
+        // one wrapper per file, and starting from an empty list means an empty folder yields no records
+        // instead of a null (or stale) iterator.
+        final List<Iterator<String>> perFileIterators = new ArrayList<>();
+        try {
+            while (fsi.hasNext()) {
+                final String nextFilePath = fsi.next();
+                // presumably the parent plugin opens whatever the descriptor's base url points to
+                interfaceDescriptor.setBaseUrl("file://".concat(nextFilePath));
+                try {
+                    log.debug("Add iterator from " + nextFilePath);
+                    perFileIterators.add(new XMLsFolderIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), namespaceList));
+                } catch (CollectorServiceException e) {
+                    // best effort: a file that cannot be collected is logged and skipped
+                    log.error("Failed collecting from path: " + nextFilePath, e);
+                }
+            }
+        } finally {
+            // do not leave the caller's descriptor pointing at the last visited file
+            interfaceDescriptor.setBaseUrl(baseUrl);
+        }
+
+        recordIterator = Iterators.concat(perFileIterators.iterator());
+        return new XMLsFolderIterable(recordIterator);
+    }
+
+    /**
+     * Fetches the content at {@code baseUrl} through the HTTP connector and iterates over it with an {@link XMLIterator}.
+     *
+     * @param baseUrl        location handed to the HTTP connector
+     * @param splitOnElement element name handed to the {@link XMLIterator}; must not be null or empty
+     * @param namespaceList  text handed to the {@link XMLsFolderIterator} for insertion into each record
+     * @return the records produced by the {@link XMLIterator}, each processed by an {@link XMLsFolderIterator}
+     * @throws CollectorServiceException if {@code splitOnElement} is missing, or propagated from the underlying calls
+     */
+    public Iterable<String> collectFromSingleRemoteFile(final String baseUrl, final String splitOnElement, final String namespaceList) throws CollectorServiceException {
+        if (splitOnElement == null || splitOnElement.isEmpty()) { throw new CollectorServiceException("Param 'splitOnElement' is null or empty"); }
+        final String xml = httpConnector.getInputSource(baseUrl);
+        final BufferedInputStream bis = new BufferedInputStream(IOUtils.toInputStream(xml, StandardCharsets.UTF_8));
+        return new XMLsFolderIterable(new XMLsFolderIterator(new XMLIterator(splitOnElement, bis), namespaceList));
+    }
+
+    public Iterator<String> getRecordIterator() {
+        return recordIterator;
+    }
+
+    public void setRecordIterator(final Iterator<String> recordIterator) {
+        this.recordIterator = recordIterator;
+    }
+
+    public HttpConnector getHttpConnector() {
+        return httpConnector;
+    }
+
+    public void setHttpConnector(final HttpConnector httpConnector) {
+        this.httpConnector = httpConnector;
+    }
+}
diff --git a/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/XMLsFolderIterable.java b/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/XMLsFolderIterable.java
new file mode 100644
index 0000000..f6b4311
--- /dev/null
+++ b/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/XMLsFolderIterable.java
@@ -0,0 +1,39 @@
+package eu.dnetlib.data.collector.plugins.ariadneplus;
+
+import java.util.Collections;
+import java.util.Iterator;
+
+/**
+ * {@link Iterable} view over a single, pre-built record iterator. Every call to {@link #iterator()}
+ * returns that same iterator instance, so the records can be traversed only once.
+ */
+public class XMLsFolderIterable implements Iterable<String> {
+
+    private Iterator<String> recordIterator;
+
+    public XMLsFolderIterable(final Iterator<String> recordIterator) {
+        this.recordIterator = recordIterator;
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * @return the wrapped iterator, or an empty iterator when none has been set
+     * @see java.lang.Iterable#iterator()
+     */
+    @Override
+    public Iterator<String> iterator() {
+        if (recordIterator == null) {
+            return Collections.emptyIterator();
+        }
+        return recordIterator;
+    }
+
+    public Iterator<String> getRecordIterator() {
+        return recordIterator;
+    }
+
+    public void setRecordIterator(final Iterator<String> recordIterator) {
+        this.recordIterator = recordIterator;
+    }
+}
diff --git a/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/XMLsFolderIterator.java b/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/XMLsFolderIterator.java
new file mode 100644
index 0000000..b757398
--- /dev/null
+++ b/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/XMLsFolderIterator.java
@@ -0,0 +1,105 @@
+package eu.dnetlib.data.collector.plugins.ariadneplus;
+
+import com.ximpleware.*;
+import eu.dnetlib.data.collector.ThreadSafeIterator;
+import eu.dnetlib.rmi.data.CollectorServiceRuntimeException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Iterator;
+
+/**
+ * Iterator decorator that inserts the configured namespace declarations into the root element of every
+ * record produced by the wrapped iterator. Records that cannot be processed are logged and skipped.
+ */
+public class XMLsFolderIterator extends ThreadSafeIterator {
+
+    private static final Log log = LogFactory.getLog(XMLsFolderIterator.class);
+
+    private Iterator<String> iterator;
+    private String namespaceList;
+
+    public XMLsFolderIterator(final Iterator<String> recordIterator, final String namespaceList) {
+        this.iterator = recordIterator;
+        this.namespaceList = namespaceList;
+    }
+
+    @Override
+    public boolean doHasNext() {
+        return iterator.hasNext();
+    }
+
+    /**
+     * Returns the next record that could be processed, skipping the ones that fail.
+     *
+     * @return the next record with the namespace declarations inserted, or the empty string when every
+     *         remaining record failed
+     */
+    @Override
+    public String doNext() {
+        // Loop rather than recurse: a long run of broken records must not grow the call stack.
+        while (true) {
+            final String record = iterator.next();
+            try {
+                return addCustomNamespace(record, getNamespaceList());
+            } catch (Exception e) {
+                log.warn("Skipping record because of exception " + e);
+                log.debug("Skipped record: " + record);
+                if (!doHasNext()) {
+                    return "";
+                }
+            }
+        }
+    }
+
+    /**
+     * Inserts the given attribute text into the root element of the XML document.
+     *
+     * @param xml           the XML record to modify
+     * @param namespaceList raw attribute text to insert (presumably {@code xmlns:...} declarations)
+     * @return the modified XML
+     * @throws CollectorServiceRuntimeException if the record cannot be parsed or modified
+     */
+    protected String addCustomNamespace(final String xml, final String namespaceList) {
+
+        try {
+            final VTDGen vg = new VTDGen();
+            // Encode and decode with an explicit charset so the result does not depend on the platform default.
+            vg.setDoc(xml.getBytes(StandardCharsets.UTF_8));
+            vg.parse(false); // namespace unaware, so that all namespace nodes are addressable using xpath @*
+            final VTDNav vn = vg.getNav();
+            final XMLModifier xm = new XMLModifier(vn);
+
+            // surrounding blanks keep the inserted text separated from the neighbouring markup
+            final byte[] attrBytes = " ".concat(namespaceList).concat(" ").getBytes(StandardCharsets.UTF_8);
+
+            vn.toElement(VTDNav.ROOT);
+            xm.insertAttribute(attrBytes);
+            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
+            xm.output(baos);
+            return new String(baos.toByteArray(), StandardCharsets.UTF_8);
+        } catch (ParseException | ModifyException | NavException | IOException | TranscodeException e) {
+            log.error("Cannot add namespace declarations to element: " + xml);
+            throw new CollectorServiceRuntimeException("Cannot add namespace declarations to element", e);
+        }
+    }
+
+    public Iterator<String> getIterator() {
+        return iterator;
+    }
+
+    public void setIterator(final Iterator<String> iterator) {
+        this.iterator = iterator;
+    }
+
+    public String getNamespaceList() {
+        return namespaceList;
+    }
+
+    public void setNamespaceList(final String namespaceList) {
+        this.namespaceList = namespaceList;
+    }
+}
diff --git a/dnet-ariadneplus/src/main/resources/eu/dnetlib/data/collector/plugins/ariadneplus/applicationContext-ariadneplus-collector-plugins.xml b/dnet-ariadneplus/src/main/resources/eu/dnetlib/data/collector/plugins/ariadneplus/applicationContext-ariadneplus-collector-plugins.xml
index 86c6034..dcb36f2 100644
--- a/dnet-ariadneplus/src/main/resources/eu/dnetlib/data/collector/plugins/ariadneplus/applicationContext-ariadneplus-collector-plugins.xml
+++ b/dnet-ariadneplus/src/main/resources/eu/dnetlib/data/collector/plugins/ariadneplus/applicationContext-ariadneplus-collector-plugins.xml
@@ -43,4 +43,20 @@ + + + + + + + + + + + + +