ADS plugin is now able to process a single remote file
This commit is contained in:
parent
b220b9de4e
commit
e5cfbd01af
|
@ -1,10 +1,17 @@
|
|||
package eu.dnetlib.data.collector.plugins.ariadneplus.ads;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Iterator;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector;
|
||||
import eu.dnetlib.miscutils.iterators.xml.XMLIterator;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
|
@ -14,12 +21,15 @@ import eu.dnetlib.data.collector.plugins.FileCollectorPlugin;
|
|||
import eu.dnetlib.data.collector.plugins.filesystem.FileSystemIterator;
|
||||
import eu.dnetlib.rmi.data.CollectorServiceException;
|
||||
import eu.dnetlib.rmi.data.InterfaceDescriptor;
|
||||
import org.apache.jena.atlas.iterator.Iter;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
|
||||
|
||||
public class ADSCollectorPlugin extends FileCollectorPlugin {
|
||||
|
||||
private Iterator<String> recordIterator;
|
||||
private URL basePath;
|
||||
@Autowired
|
||||
private HttpConnector httpConnector;
|
||||
|
||||
/** The Constant log. */
|
||||
private static final Log log = LogFactory.getLog(ADSCollectorPlugin.class);
|
||||
|
@ -30,38 +40,70 @@ public class ADSCollectorPlugin extends FileCollectorPlugin {
|
|||
|
||||
final String baseUrl = interfaceDescriptor.getBaseUrl();
|
||||
if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
|
||||
String url = "file://".concat(baseUrl);
|
||||
try {
|
||||
basePath = new URL(url);
|
||||
} catch (MalformedURLException mue) {
|
||||
log.error("Failed collecting from base url " + url, mue);
|
||||
throw new CollectorServiceException(mue);
|
||||
}
|
||||
|
||||
File baseDir = new File(basePath.getPath());
|
||||
if (!baseDir.exists()) { throw new CollectorServiceException(String.format("The base ULR %s, does not exist", basePath.getPath())); }
|
||||
|
||||
log.debug("Start collecting from folder " + baseDir + " ...");
|
||||
final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), "xml");
|
||||
|
||||
boolean emptyIterator = true;
|
||||
while (fsi.hasNext()) {
|
||||
String nextFilePath = fsi.next();
|
||||
interfaceDescriptor.setBaseUrl("file://".concat(nextFilePath));
|
||||
String srf = interfaceDescriptor.getParams().get("singleRemoteFile");
|
||||
boolean singleRemoteFile = Boolean.valueOf(srf);
|
||||
log.debug("bool singleRemoteFile? "+singleRemoteFile);
|
||||
if(!singleRemoteFile) {
|
||||
String url = (baseUrl.startsWith("file://")) ? baseUrl : "file://".concat(baseUrl);
|
||||
URL basePath;
|
||||
try {
|
||||
log.debug("Add iterator from " + nextFilePath);
|
||||
if (emptyIterator) {
|
||||
recordIterator = new ADSIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), null, null);
|
||||
emptyIterator = false;
|
||||
}
|
||||
else {
|
||||
recordIterator = Iterators.concat(recordIterator, new ADSIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), null, null));
|
||||
}
|
||||
} catch (CollectorServiceException e) {
|
||||
log.error("Failed collecting from path: " + nextFilePath, e);
|
||||
basePath = new URL(url);
|
||||
} catch (MalformedURLException mue) {
|
||||
log.error("Failed collecting from base url " + url, mue);
|
||||
throw new CollectorServiceException(mue);
|
||||
}
|
||||
|
||||
File baseDir = new File(basePath.getPath());
|
||||
if (!baseDir.exists()) {
|
||||
throw new CollectorServiceException(String.format("The base URL %s, does not exist", basePath.getPath()));
|
||||
}
|
||||
|
||||
log.debug("Start collecting from folder " + baseDir + " ...");
|
||||
final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), "xml");
|
||||
|
||||
boolean emptyIterator = true;
|
||||
while (fsi.hasNext()) {
|
||||
String nextFilePath = fsi.next();
|
||||
interfaceDescriptor.setBaseUrl("file://".concat(nextFilePath));
|
||||
try {
|
||||
log.debug("Add iterator from " + nextFilePath);
|
||||
if (emptyIterator) {
|
||||
recordIterator = new ADSIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator());
|
||||
emptyIterator = false;
|
||||
} else {
|
||||
recordIterator = Iterators.concat(recordIterator, new ADSIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator()));
|
||||
}
|
||||
} catch (CollectorServiceException e) {
|
||||
log.error("Failed collecting from path: " + nextFilePath, e);
|
||||
}
|
||||
}
|
||||
return new ADSIterable(recordIterator);
|
||||
}
|
||||
else {
|
||||
//singleRemoteFile
|
||||
return collectFromSingleRemoteFile(baseUrl, interfaceDescriptor.getParams().get("splitOnElement"));
|
||||
}
|
||||
return new ADSIterable(recordIterator);
|
||||
}
|
||||
|
||||
public Iterable<String> collectFromSingleRemoteFile(final String baseUrl, final String splitOnElement) throws CollectorServiceException {
|
||||
final String xml = httpConnector.getInputSource(baseUrl);
|
||||
BufferedInputStream bis = new BufferedInputStream(IOUtils.toInputStream(xml, Charset.forName("utf-8")));
|
||||
return new ADSIterable(new ADSIterator(new XMLIterator(splitOnElement, bis)));
|
||||
}
|
||||
|
||||
public Iterator<String> getRecordIterator() {
|
||||
return recordIterator;
|
||||
}
|
||||
|
||||
public void setRecordIterator(Iterator<String> recordIterator) {
|
||||
this.recordIterator = recordIterator;
|
||||
}
|
||||
|
||||
public HttpConnector getHttpConnector() {
|
||||
return httpConnector;
|
||||
}
|
||||
|
||||
public void setHttpConnector(HttpConnector httpConnector) {
|
||||
this.httpConnector = httpConnector;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@ import java.io.ByteArrayOutputStream;
|
|||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.ximpleware.*;
|
||||
import eu.dnetlib.data.collector.ThreadSafeIterator;
|
||||
import eu.dnetlib.rmi.data.CollectorServiceRuntimeException;
|
||||
|
@ -13,27 +14,26 @@ import org.apache.commons.logging.LogFactory;
|
|||
public class ADSIterator extends ThreadSafeIterator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(ADSIterator.class);
|
||||
private Iterator<String> identifiers;
|
||||
private String baseUrl;
|
||||
private String suffix;
|
||||
|
||||
public ADSIterator(final Iterator<String> idIterator, final String baseUrl, final String suffix){
|
||||
this.identifiers = idIterator;
|
||||
this.baseUrl = baseUrl;
|
||||
this.suffix = suffix;
|
||||
private Iterator<String> iterator;
|
||||
|
||||
public ADSIterator(final Iterator<String> recordIterator){
|
||||
this.iterator = recordIterator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean doHasNext() {
|
||||
return identifiers.hasNext();
|
||||
return iterator.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String doNext() {
|
||||
String record = identifiers.next();
|
||||
String record = iterator.next();
|
||||
try {
|
||||
return addADSNamespace(record);
|
||||
} catch (Exception e) {
|
||||
log.warn("Skipping record because of exception "+e);
|
||||
log.debug("Skipped record: "+record);
|
||||
if(this.hasNext()){
|
||||
return this.next();
|
||||
}
|
||||
|
@ -66,27 +66,12 @@ public class ADSIterator extends ThreadSafeIterator {
|
|||
}
|
||||
}
|
||||
|
||||
public Iterator<String> getIdentifiers() {
|
||||
return identifiers;
|
||||
public Iterator<String> getIterator() {
|
||||
return iterator;
|
||||
}
|
||||
|
||||
public void setIdentifiers(final Iterator<String> identifiers) {
|
||||
this.identifiers = identifiers;
|
||||
public void setIterator(final Iterator<String> iterator) {
|
||||
this.iterator = iterator;
|
||||
}
|
||||
|
||||
public String getBaseUrl() {
|
||||
return baseUrl;
|
||||
}
|
||||
|
||||
public void setBaseUrl(final String baseUrl) {
|
||||
this.baseUrl = baseUrl;
|
||||
}
|
||||
|
||||
public String getSuffix() {
|
||||
return suffix;
|
||||
}
|
||||
|
||||
public void setSuffix(final String suffix) {
|
||||
this.suffix = suffix;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,6 +12,8 @@
|
|||
<list>
|
||||
<bean class="eu.dnetlib.rmi.data.ProtocolParameter"
|
||||
p:name="splitOnElement"/>
|
||||
<bean class="eu.dnetlib.rmi.data.ProtocolParameter"
|
||||
p:name="singleRemoteFile" p:type="BOOLEAN" p:optional="true"/>
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
|
|
Loading…
Reference in New Issue