New collector plugin (xmlsFolder), based on the ads one, adding a new parameter that holds the list of namespaces to be added to each generated XML record during the collection step

This commit is contained in:
Enrico Ottonello 2021-01-15 13:33:23 +01:00
parent 26c5714107
commit 290259fbc2
4 changed files with 235 additions and 0 deletions

View File

@ -0,0 +1,105 @@
package eu.dnetlib.data.collector.plugins.ariadneplus;
import com.google.common.collect.Iterators;
import eu.dnetlib.data.collector.plugins.FileCollectorPlugin;
import eu.dnetlib.data.collector.plugins.filesystem.FileSystemIterator;
import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector;
import eu.dnetlib.miscutils.iterators.xml.XMLIterator;
import eu.dnetlib.rmi.data.CollectorServiceException;
import eu.dnetlib.rmi.data.InterfaceDescriptor;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import java.io.BufferedInputStream;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Iterator;
/**
 * Collector plugin that reads XML records either from every {@code .xml} file found under a local
 * folder, or from a single remote file, and decorates each record with the attribute text given
 * in the {@code namespaceList} interface parameter (see {@link XMLsFolderIterator}).
 *
 * <p>Interface parameters read by this plugin:
 * <ul>
 * <li>{@code singleRemoteFile}: parsed as a boolean; when true the base URL is fetched through the
 * {@link HttpConnector}, otherwise it is treated as a local folder;</li>
 * <li>{@code namespaceList}: mandatory, passed to every {@link XMLsFolderIterator};</li>
 * <li>{@code splitOnElement}: name of the element used to split a single remote file into records.</li>
 * </ul>
 */
public class XMLsFolderCollectorPlugin extends FileCollectorPlugin {

    /** The Constant log. */
    private static final Log log = LogFactory.getLog(eu.dnetlib.data.collector.plugins.ariadneplus.XMLsFolderCollectorPlugin.class);

    /** Iterator produced by the most recent folder collection; exposed through the accessors below. */
    private Iterator<String> recordIterator;

    @Autowired
    private HttpConnector httpConnector;

    /**
     * Collects the records of the given interface.
     *
     * @param interfaceDescriptor
     *            descriptor providing the base URL and the plugin parameters
     * @param fromDate
     *            forwarded unchanged to the parent plugin
     * @param untilDate
     *            forwarded unchanged to the parent plugin
     * @return the collected records, each one decorated with the configured namespace declarations
     * @throws CollectorServiceException
     *             if the base URL or the {@code namespaceList} parameter is null or empty, if the base
     *             URL is malformed, or if the folder does not exist
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        final String baseUrl = interfaceDescriptor.getBaseUrl();
        if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }

        // parseBoolean maps null (parameter not set) to false without boxing.
        final boolean singleRemoteFile = Boolean.parseBoolean(interfaceDescriptor.getParams().get("singleRemoteFile"));
        log.debug("bool singleRemoteFile? " + singleRemoteFile);

        final String namespaceList = interfaceDescriptor.getParams().get("namespaceList");
        if (namespaceList == null || namespaceList.isEmpty()) { throw new CollectorServiceException("Param 'namespaceList' is null or empty"); }

        if (singleRemoteFile) {
            return collectFromSingleRemoteFile(baseUrl, interfaceDescriptor.getParams().get("splitOnElement"), namespaceList);
        }
        return collectFromFolder(interfaceDescriptor, baseUrl, fromDate, untilDate, namespaceList);
    }

    /**
     * Walks the local folder designated by {@code baseUrl} and chains the records of every
     * {@code xml} file found by the {@link FileSystemIterator}.
     */
    private Iterable<String> collectFromFolder(final InterfaceDescriptor interfaceDescriptor,
            final String baseUrl,
            final String fromDate,
            final String untilDate,
            final String namespaceList) throws CollectorServiceException {
        final String url = baseUrl.startsWith("file://") ? baseUrl : "file://".concat(baseUrl);
        final URL basePath;
        try {
            basePath = new URL(url);
        } catch (MalformedURLException mue) {
            log.error("Failed collecting from base url " + url, mue);
            throw new CollectorServiceException(mue);
        }

        final File baseDir = new File(basePath.getPath());
        if (!baseDir.exists()) {
            throw new CollectorServiceException(String.format("The base URL %s, does not exist", basePath.getPath()));
        }
        log.debug("Start collecting from folder " + baseDir + " ...");

        final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), "xml");
        // One iterator per file, concatenated once at the end: this keeps the chain flat and
        // naturally yields an empty (never null, never stale) result when no file is found.
        final java.util.List<Iterator<String>> perFileIterators = new java.util.ArrayList<Iterator<String>>();
        while (fsi.hasNext()) {
            final String nextFilePath = fsi.next();
            // The parent plugin reads the file location from the descriptor, so it is re-pointed
            // to the current file before delegating.
            interfaceDescriptor.setBaseUrl("file://".concat(nextFilePath));
            try {
                log.debug("Add iterator from " + nextFilePath);
                perFileIterators.add(new XMLsFolderIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), namespaceList));
            } catch (CollectorServiceException e) {
                // Best effort: a file that cannot be collected is logged and skipped.
                log.error("Failed collecting from path: " + nextFilePath, e);
            }
        }

        final Iterator<String> records = Iterators.concat(perFileIterators.iterator());
        this.recordIterator = records;
        return new XMLsFolderIterable(records);
    }

    /**
     * Downloads a single remote file and splits it into records.
     *
     * @param baseUrl
     *            location of the remote file, fetched through the {@link HttpConnector}
     * @param splitOnElement
     *            element name handed to the {@link XMLIterator} to split the file
     * @param namespaceList
     *            attribute text added to every record
     * @return the records of the remote file
     * @throws CollectorServiceException
     *             declared for callers; raised by the connector on download problems
     */
    public Iterable<String> collectFromSingleRemoteFile(final String baseUrl, final String splitOnElement, final String namespaceList) throws CollectorServiceException {
        final String xml = httpConnector.getInputSource(baseUrl);
        final BufferedInputStream bis = new BufferedInputStream(IOUtils.toInputStream(xml, Charset.forName("utf-8")));
        return new XMLsFolderIterable(new XMLsFolderIterator(new XMLIterator(splitOnElement, bis), namespaceList));
    }

    public Iterator<String> getRecordIterator() {
        return recordIterator;
    }

    public void setRecordIterator(Iterator<String> recordIterator) {
        this.recordIterator = recordIterator;
    }

    public HttpConnector getHttpConnector() {
        return httpConnector;
    }

    public void setHttpConnector(HttpConnector httpConnector) {
        this.httpConnector = httpConnector;
    }
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.data.collector.plugins.ariadneplus;
import java.util.Iterator;
/**
 * Minimal {@link Iterable} adapter around an already-built record iterator.
 *
 * <p>The same iterator instance is handed out on every call to {@link #iterator()}, so the
 * records can effectively be traversed only once.
 */
public class XMLsFolderIterable implements Iterable<String> {

    /** The wrapped iterator; may be null if none was supplied. */
    private Iterator<String> recordIterator;

    /**
     * @param recordIterator
     *            the iterator to expose; a null value is treated as "no records"
     */
    public XMLsFolderIterable(Iterator<String> recordIterator) {
        this.recordIterator = recordIterator;
    }

    /**
     * {@inheritDoc}
     *
     * @return the wrapped iterator, or an empty iterator when none was supplied (never null)
     * @see java.lang.Iterable#iterator()
     */
    @Override
    public Iterator<String> iterator() {
        // Iterable must never hand out null: a for-each over it would fail with a NullPointerException.
        if (recordIterator == null) {
            return java.util.Collections.<String>emptyIterator();
        }
        return recordIterator;
    }

    /**
     * @return the wrapped iterator exactly as stored (possibly null)
     */
    public Iterator<String> getRecordIterator() {
        return recordIterator;
    }

    public void setRecordIterator(Iterator<String> recordIterator) {
        this.recordIterator = recordIterator;
    }
}

View File

@ -0,0 +1,84 @@
package eu.dnetlib.data.collector.plugins.ariadneplus;
import com.ximpleware.*;
import eu.dnetlib.data.collector.ThreadSafeIterator;
import eu.dnetlib.rmi.data.CollectorServiceRuntimeException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Iterator;
/**
 * Iterator decorator that inserts the configured attribute text into the root element of every
 * XML record produced by the wrapped iterator.
 *
 * <p>Records that cannot be processed are logged and skipped. Because {@link #doHasNext()} only
 * reflects the wrapped iterator, a failure on the very last record cannot be hidden from the
 * caller: in that case {@link #doNext()} returns an empty string.
 */
public class XMLsFolderIterator extends ThreadSafeIterator {

    private static final Log log = LogFactory.getLog(XMLsFolderIterator.class);

    /** Source of the raw XML records. */
    private Iterator<String> iterator;

    /**
     * Text inserted as attributes of the root element.
     * NOTE(review): presumably a space-separated list of xmlns declarations - confirm with the datasource configuration.
     */
    private String namespaceList;

    /**
     * @param recordIterator
     *            source of the raw XML records
     * @param namespaceList
     *            attribute text to insert into the root element of each record
     */
    public XMLsFolderIterator(final Iterator<String> recordIterator, final String namespaceList) {
        this.iterator = recordIterator;
        this.namespaceList = namespaceList;
    }

    @Override
    public boolean doHasNext() {
        return iterator.hasNext();
    }

    /**
     * Returns the next record that could be decorated, skipping the ones that fail.
     *
     * @return the decorated record, or an empty string when all the remaining records failed
     */
    @Override
    public String doNext() {
        String record = iterator.next();
        // Loop instead of recursing, so a long run of malformed records cannot exhaust the stack.
        while (true) {
            try {
                return addCustomNamespace(record, getNamespaceList());
            } catch (Exception e) {
                log.warn("Skipping record because of exception " + e);
                log.debug("Skipped record: " + record);
                if (!doHasNext()) {
                    return "";
                }
                record = iterator.next();
            }
        }
    }

    /**
     * Inserts {@code namespaceList}, surrounded by single spaces, as attribute text of the root
     * element of {@code xml}.
     *
     * @param xml
     *            the record to decorate
     * @param namespaceList
     *            attribute text to insert
     * @return the modified document
     * @throws CollectorServiceRuntimeException
     *             if the record cannot be parsed or modified
     */
    protected String addCustomNamespace(final String xml, String namespaceList) {
        try {
            // Explicit UTF-8 on every String<->byte conversion: the platform default charset would
            // corrupt non-ASCII content on JVMs not running with a UTF-8 default.
            final java.nio.charset.Charset utf8 = java.nio.charset.StandardCharsets.UTF_8;
            VTDGen vg = new VTDGen();
            vg.setDoc(xml.getBytes(utf8));
            vg.parse(false); // namespace unaware to all name space nodes addressable using xpath @*
            VTDNav vn = vg.getNav();
            XMLModifier xm = new XMLModifier(vn);
            // Surrounding spaces keep the inserted text separated from the element name and from
            // any attribute already present.
            final byte[] attrBytes = " ".concat(namespaceList).concat(" ").getBytes(utf8);
            vn.toElement(VTDNav.ROOT);
            xm.insertAttribute(attrBytes);
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            xm.output(baos);
            return new String(baos.toByteArray(), utf8);
        } catch (ParseException | ModifyException | NavException | IOException | TranscodeException e) {
            log.error("Cannot add namespace declarations to element: " + xml);
            throw new CollectorServiceRuntimeException("Cannot add namespace declarations to element", e);
        }
    }

    public Iterator<String> getIterator() {
        return iterator;
    }

    public void setIterator(final Iterator<String> iterator) {
        this.iterator = iterator;
    }

    public String getNamespaceList() {
        return namespaceList;
    }

    public void setNamespaceList(String namespaceList) {
        this.namespaceList = namespaceList;
    }
}

View File

@ -43,4 +43,20 @@
</bean>
</property>
</bean>
<!--
Registers the XMLsFolderCollectorPlugin under the protocol name "xmlsFolder".
Declared protocol parameters:
* splitOnElement: read by the plugin when collecting from a single remote file;
* singleRemoteFile: optional boolean selecting the single-remote-file mode instead of the folder mode;
* namespaceList: text added to the root element of each collected record; the plugin rejects a null or empty value.
-->
<bean id="xmlsFolderCollectorPlugin" class="eu.dnetlib.data.collector.plugins.ariadneplus.XMLsFolderCollectorPlugin">
<property name="protocolDescriptor">
<bean class="eu.dnetlib.rmi.data.ProtocolDescriptor" p:name="xmlsFolder">
<property name="params">
<list>
<bean class="eu.dnetlib.rmi.data.ProtocolParameter"
p:name="splitOnElement"/>
<bean class="eu.dnetlib.rmi.data.ProtocolParameter"
p:name="singleRemoteFile" p:type="BOOLEAN" p:optional="true"/>
<bean class="eu.dnetlib.rmi.data.ProtocolParameter"
p:name="namespaceList"/>
</list>
</property>
</bean>
</property>
</bean>
</beans>