ADS plugin now able to process one single remote file

This commit is contained in:
Alessia Bardi 2020-06-05 15:20:21 +02:00
parent b220b9de4e
commit e5cfbd01af
3 changed files with 88 additions and 59 deletions

View File

@ -1,10 +1,17 @@
package eu.dnetlib.data.collector.plugins.ariadneplus.ads; package eu.dnetlib.data.collector.plugins.ariadneplus.ads;
import java.io.BufferedInputStream;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.nio.charset.Charset;
import java.util.Iterator; import java.util.Iterator;
import com.google.common.base.Strings;
import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector;
import eu.dnetlib.miscutils.iterators.xml.XMLIterator;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
@ -14,12 +21,15 @@ import eu.dnetlib.data.collector.plugins.FileCollectorPlugin;
import eu.dnetlib.data.collector.plugins.filesystem.FileSystemIterator; import eu.dnetlib.data.collector.plugins.filesystem.FileSystemIterator;
import eu.dnetlib.rmi.data.CollectorServiceException; import eu.dnetlib.rmi.data.CollectorServiceException;
import eu.dnetlib.rmi.data.InterfaceDescriptor; import eu.dnetlib.rmi.data.InterfaceDescriptor;
import org.apache.jena.atlas.iterator.Iter;
import org.springframework.beans.factory.annotation.Autowired;
public class ADSCollectorPlugin extends FileCollectorPlugin { public class ADSCollectorPlugin extends FileCollectorPlugin {
private Iterator<String> recordIterator; private Iterator<String> recordIterator;
private URL basePath; @Autowired
private HttpConnector httpConnector;
/** The Constant log. */ /** The Constant log. */
private static final Log log = LogFactory.getLog(ADSCollectorPlugin.class); private static final Log log = LogFactory.getLog(ADSCollectorPlugin.class);
@ -30,7 +40,12 @@ public class ADSCollectorPlugin extends FileCollectorPlugin {
final String baseUrl = interfaceDescriptor.getBaseUrl(); final String baseUrl = interfaceDescriptor.getBaseUrl();
if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); } if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
String url = "file://".concat(baseUrl); String srf = interfaceDescriptor.getParams().get("singleRemoteFile");
boolean singleRemoteFile = Boolean.valueOf(srf);
log.debug("bool singleRemoteFile? "+singleRemoteFile);
if(!singleRemoteFile) {
String url = (baseUrl.startsWith("file://")) ? baseUrl : "file://".concat(baseUrl);
URL basePath;
try { try {
basePath = new URL(url); basePath = new URL(url);
} catch (MalformedURLException mue) { } catch (MalformedURLException mue) {
@ -39,7 +54,9 @@ public class ADSCollectorPlugin extends FileCollectorPlugin {
} }
File baseDir = new File(basePath.getPath()); File baseDir = new File(basePath.getPath());
if (!baseDir.exists()) { throw new CollectorServiceException(String.format("The base ULR %s, does not exist", basePath.getPath())); } if (!baseDir.exists()) {
throw new CollectorServiceException(String.format("The base URL %s, does not exist", basePath.getPath()));
}
log.debug("Start collecting from folder " + baseDir + " ..."); log.debug("Start collecting from folder " + baseDir + " ...");
final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), "xml"); final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), "xml");
@ -51,11 +68,10 @@ public class ADSCollectorPlugin extends FileCollectorPlugin {
try { try {
log.debug("Add iterator from " + nextFilePath); log.debug("Add iterator from " + nextFilePath);
if (emptyIterator) { if (emptyIterator) {
recordIterator = new ADSIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), null, null); recordIterator = new ADSIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator());
emptyIterator = false; emptyIterator = false;
} } else {
else { recordIterator = Iterators.concat(recordIterator, new ADSIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator()));
recordIterator = Iterators.concat(recordIterator, new ADSIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), null, null));
} }
} catch (CollectorServiceException e) { } catch (CollectorServiceException e) {
log.error("Failed collecting from path: " + nextFilePath, e); log.error("Failed collecting from path: " + nextFilePath, e);
@ -63,5 +79,31 @@ public class ADSCollectorPlugin extends FileCollectorPlugin {
} }
return new ADSIterable(recordIterator); return new ADSIterable(recordIterator);
} }
else {
//singleRemoteFile
return collectFromSingleRemoteFile(baseUrl, interfaceDescriptor.getParams().get("splitOnElement"));
}
}
/**
 * Collects records from one single remote XML file.
 * <p>
 * The content at {@code baseUrl} is downloaded through the injected {@link HttpConnector} as one
 * String (i.e. it is held in memory entirely), re-exposed as a UTF-8 stream and handed to an
 * {@link XMLIterator} together with {@code splitOnElement}. The resulting records are wrapped in an
 * {@link ADSIterator} and returned as an {@link ADSIterable}.
 *
 * @param baseUrl
 *            address of the remote file to download; must not be null or empty
 * @param splitOnElement
 *            value passed to {@link XMLIterator} (presumably the name of the element delimiting one
 *            record -- confirm against XMLIterator); must not be null or empty
 * @return an iterable over the records found in the remote file
 * @throws CollectorServiceException
 *             if a parameter is null or empty, if no content could be retrieved, or if the
 *             {@link HttpConnector} fails
 */
public Iterable<String> collectFromSingleRemoteFile(final String baseUrl, final String splitOnElement) throws CollectorServiceException {
	// This method is public, so validate here too instead of relying on the checks done by collect(...).
	if (Strings.isNullOrEmpty(baseUrl)) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
	// Fail fast with a meaningful message rather than letting XMLIterator deal with a missing element name.
	if (Strings.isNullOrEmpty(splitOnElement)) { throw new CollectorServiceException("Param 'splitOnElement' is null or empty"); }

	final String xml = httpConnector.getInputSource(baseUrl);
	// IOUtils.toInputStream would fail with a bare NullPointerException on a null String.
	if (xml == null) { throw new CollectorServiceException("No content retrieved from " + baseUrl); }

	// The stream is intentionally left open: XMLIterator reads from it while the caller iterates.
	BufferedInputStream bis = new BufferedInputStream(IOUtils.toInputStream(xml, Charset.forName("utf-8")));
	return new ADSIterable(new ADSIterator(new XMLIterator(splitOnElement, bis)));
}
/**
 * Returns the iterator stored in the {@code recordIterator} field.
 * <p>
 * Note: {@code collectFromSingleRemoteFile} builds and returns its own iterator without assigning
 * this field, so the value returned here does not reflect a single-remote-file collection.
 *
 * @return the current record iterator, possibly {@code null} if it was never assigned
 */
public Iterator<String> getRecordIterator() {
return recordIterator;
}
/**
 * Replaces the iterator stored in the {@code recordIterator} field.
 *
 * @param recordIterator
 *            the iterator over records to store
 */
public void setRecordIterator(Iterator<String> recordIterator) {
this.recordIterator = recordIterator;
}
/**
 * Returns the {@link HttpConnector} used to download the remote file in
 * {@code collectFromSingleRemoteFile}.
 *
 * @return the connector held by this plugin (the field is annotated with {@code @Autowired})
 */
public HttpConnector getHttpConnector() {
return httpConnector;
}
/**
 * Sets the {@link HttpConnector} explicitly, as an alternative to the {@code @Autowired} field
 * injection (e.g. when the plugin is instantiated by hand).
 *
 * @param httpConnector
 *            the connector to use for downloading the remote file
 */
public void setHttpConnector(HttpConnector httpConnector) {
this.httpConnector = httpConnector;
}
} }

View File

@ -4,6 +4,7 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.util.Iterator; import java.util.Iterator;
import com.google.common.collect.Lists;
import com.ximpleware.*; import com.ximpleware.*;
import eu.dnetlib.data.collector.ThreadSafeIterator; import eu.dnetlib.data.collector.ThreadSafeIterator;
import eu.dnetlib.rmi.data.CollectorServiceRuntimeException; import eu.dnetlib.rmi.data.CollectorServiceRuntimeException;
@ -13,27 +14,26 @@ import org.apache.commons.logging.LogFactory;
public class ADSIterator extends ThreadSafeIterator { public class ADSIterator extends ThreadSafeIterator {
private static final Log log = LogFactory.getLog(ADSIterator.class); private static final Log log = LogFactory.getLog(ADSIterator.class);
private Iterator<String> identifiers;
private String baseUrl;
private String suffix;
public ADSIterator(final Iterator<String> idIterator, final String baseUrl, final String suffix){ private Iterator<String> iterator;
this.identifiers = idIterator;
this.baseUrl = baseUrl; public ADSIterator(final Iterator<String> recordIterator){
this.suffix = suffix; this.iterator = recordIterator;
} }
@Override @Override
public boolean doHasNext() { public boolean doHasNext() {
return identifiers.hasNext(); return iterator.hasNext();
} }
@Override @Override
public String doNext() { public String doNext() {
String record = identifiers.next(); String record = iterator.next();
try { try {
return addADSNamespace(record); return addADSNamespace(record);
} catch (Exception e) { } catch (Exception e) {
log.warn("Skipping record because of exception "+e);
log.debug("Skipped record: "+record);
if(this.hasNext()){ if(this.hasNext()){
return this.next(); return this.next();
} }
@ -66,27 +66,12 @@ public class ADSIterator extends ThreadSafeIterator {
} }
} }
public Iterator<String> getIdentifiers() { public Iterator<String> getIterator() {
return identifiers; return iterator;
} }
public void setIdentifiers(final Iterator<String> identifiers) { public void setIterator(final Iterator<String> iterator) {
this.identifiers = identifiers; this.iterator = iterator;
} }
public String getBaseUrl() {
return baseUrl;
}
public void setBaseUrl(final String baseUrl) {
this.baseUrl = baseUrl;
}
public String getSuffix() {
return suffix;
}
public void setSuffix(final String suffix) {
this.suffix = suffix;
}
} }

View File

@ -12,6 +12,8 @@
<list> <list>
<bean class="eu.dnetlib.rmi.data.ProtocolParameter" <bean class="eu.dnetlib.rmi.data.ProtocolParameter"
p:name="splitOnElement"/> p:name="splitOnElement"/>
<!-- Optional boolean switch. ADSCollectorPlugin reads a parameter with this name and parses it with
     Boolean.valueOf: when true, the base URL is downloaded as one remote XML file and split on
     'splitOnElement'; when false or absent, the base URL is treated as a local folder of xml files. -->
<bean class="eu.dnetlib.rmi.data.ProtocolParameter"
p:name="singleRemoteFile" p:type="BOOLEAN" p:optional="true"/>
</list> </list>
</property> </property>
</bean> </bean>