New collector plugin (xmlsFolder), based on the ads one, adding a new parameter that holds the list of namespace declarations to be added to each generated XML record during the collection step
This commit is contained in:
parent
26c5714107
commit
290259fbc2
|
@ -0,0 +1,105 @@
|
|||
package eu.dnetlib.data.collector.plugins.ariadneplus;
|
||||
|
||||
import com.google.common.collect.Iterators;
|
||||
import eu.dnetlib.data.collector.plugins.FileCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.plugins.filesystem.FileSystemIterator;
|
||||
import eu.dnetlib.data.collector.plugins.oai.engine.HttpConnector;
|
||||
import eu.dnetlib.miscutils.iterators.xml.XMLIterator;
|
||||
import eu.dnetlib.rmi.data.CollectorServiceException;
|
||||
import eu.dnetlib.rmi.data.InterfaceDescriptor;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.File;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Iterator;
|
||||
|
||||
public class XMLsFolderCollectorPlugin extends FileCollectorPlugin {
|
||||
|
||||
private Iterator<String> recordIterator;
|
||||
@Autowired
|
||||
private HttpConnector httpConnector;
|
||||
|
||||
/** The Constant log. */
|
||||
private static final Log log = LogFactory.getLog(eu.dnetlib.data.collector.plugins.ariadneplus.XMLsFolderCollectorPlugin.class);
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
|
||||
final String baseUrl = interfaceDescriptor.getBaseUrl();
|
||||
if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
|
||||
String srf = interfaceDescriptor.getParams().get("singleRemoteFile");
|
||||
boolean singleRemoteFile = Boolean.valueOf(srf);
|
||||
log.debug("bool singleRemoteFile? "+singleRemoteFile);
|
||||
final String namespaceList = interfaceDescriptor.getParams().get("namespaceList");
|
||||
if (namespaceList == null || namespaceList.isEmpty()) { throw new CollectorServiceException("Param 'namespaceList' is null or empty"); }
|
||||
if(!singleRemoteFile) {
|
||||
String url = (baseUrl.startsWith("file://")) ? baseUrl : "file://".concat(baseUrl);
|
||||
URL basePath;
|
||||
try {
|
||||
basePath = new URL(url);
|
||||
} catch (MalformedURLException mue) {
|
||||
log.error("Failed collecting from base url " + url, mue);
|
||||
throw new CollectorServiceException(mue);
|
||||
}
|
||||
|
||||
File baseDir = new File(basePath.getPath());
|
||||
if (!baseDir.exists()) {
|
||||
throw new CollectorServiceException(String.format("The base URL %s, does not exist", basePath.getPath()));
|
||||
}
|
||||
|
||||
log.debug("Start collecting from folder " + baseDir + " ...");
|
||||
final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), "xml");
|
||||
|
||||
boolean emptyIterator = true;
|
||||
while (fsi.hasNext()) {
|
||||
String nextFilePath = fsi.next();
|
||||
interfaceDescriptor.setBaseUrl("file://".concat(nextFilePath));
|
||||
try {
|
||||
log.debug("Add iterator from " + nextFilePath);
|
||||
if (emptyIterator) {
|
||||
recordIterator = new XMLsFolderIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), namespaceList);
|
||||
emptyIterator = false;
|
||||
} else {
|
||||
recordIterator = Iterators.concat(recordIterator, new XMLsFolderIterator(super.collect(interfaceDescriptor, fromDate, untilDate).iterator(), namespaceList));
|
||||
}
|
||||
} catch (CollectorServiceException e) {
|
||||
log.error("Failed collecting from path: " + nextFilePath, e);
|
||||
}
|
||||
}
|
||||
return new XMLsFolderIterable(recordIterator);
|
||||
}
|
||||
else {
|
||||
//singleRemoteFile
|
||||
return collectFromSingleRemoteFile(baseUrl, interfaceDescriptor.getParams().get("splitOnElement"), namespaceList);
|
||||
}
|
||||
}
|
||||
|
||||
public Iterable<String> collectFromSingleRemoteFile(final String baseUrl, final String splitOnElement, final String namespaceList) throws CollectorServiceException {
|
||||
final String xml = httpConnector.getInputSource(baseUrl);
|
||||
BufferedInputStream bis = new BufferedInputStream(IOUtils.toInputStream(xml, Charset.forName("utf-8")));
|
||||
return new XMLsFolderIterable(new XMLsFolderIterator(new XMLIterator(splitOnElement, bis), namespaceList));
|
||||
}
|
||||
|
||||
public Iterator<String> getRecordIterator() {
|
||||
return recordIterator;
|
||||
}
|
||||
|
||||
public void setRecordIterator(Iterator<String> recordIterator) {
|
||||
this.recordIterator = recordIterator;
|
||||
}
|
||||
|
||||
public HttpConnector getHttpConnector() {
|
||||
return httpConnector;
|
||||
}
|
||||
|
||||
public void setHttpConnector(HttpConnector httpConnector) {
|
||||
this.httpConnector = httpConnector;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
package eu.dnetlib.data.collector.plugins.ariadneplus;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
 * Minimal {@link Iterable} adapter around an already-built record iterator.
 *
 * <p>The same iterator instance is handed out on every call to {@link #iterator()}, so the
 * records can be traversed only once.
 */
public class XMLsFolderIterable implements Iterable<String> {

    /** The wrapped iterator; may be {@code null} when nothing was collected. */
    private Iterator<String> recordIterator;

    /**
     * @param recordIterator the iterator to expose, or {@code null} for an empty iterable
     */
    public XMLsFolderIterable(Iterator<String> recordIterator) {
        this.recordIterator = recordIterator;
    }

    /**
     * {@inheritDoc}
     *
     * <p>Never returns {@code null}: when no iterator has been set an empty one is returned, so
     * that for-each loops over this object do not fail.
     *
     * @see java.lang.Iterable#iterator()
     */
    @Override
    public Iterator<String> iterator() {
        if (recordIterator == null) {
            return java.util.Collections.<String>emptyIterator();
        }
        return recordIterator;
    }

    /**
     * @return the wrapped iterator exactly as it was set, possibly {@code null}
     */
    public Iterator<String> getRecordIterator() {
        return recordIterator;
    }

    /**
     * @param recordIterator the iterator to expose from now on
     */
    public void setRecordIterator(Iterator<String> recordIterator) {
        this.recordIterator = recordIterator;
    }
}
|
|
@ -0,0 +1,84 @@
|
|||
package eu.dnetlib.data.collector.plugins.ariadneplus;
|
||||
|
||||
import com.ximpleware.*;
|
||||
import eu.dnetlib.data.collector.ThreadSafeIterator;
|
||||
import eu.dnetlib.rmi.data.CollectorServiceRuntimeException;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
public class XMLsFolderIterator extends ThreadSafeIterator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(XMLsFolderIterator.class);
|
||||
|
||||
private Iterator<String> iterator;
|
||||
private String namespaceList;
|
||||
|
||||
public XMLsFolderIterator(final Iterator<String> recordIterator, final String namespaceList){
|
||||
|
||||
this.iterator = recordIterator;
|
||||
this.namespaceList = namespaceList;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean doHasNext() {
|
||||
return iterator.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String doNext() {
|
||||
String record = iterator.next();
|
||||
try {
|
||||
return addCustomNamespace(record, getNamespaceList());
|
||||
} catch (Exception e) {
|
||||
log.warn("Skipping record because of exception "+e);
|
||||
log.debug("Skipped record: "+record);
|
||||
if(this.hasNext()){
|
||||
return this.next();
|
||||
}
|
||||
else return "";
|
||||
}
|
||||
}
|
||||
|
||||
protected String addCustomNamespace(final String xml, String namespaceList) {
|
||||
|
||||
try {
|
||||
VTDGen vg = new VTDGen();
|
||||
vg.setDoc(xml.getBytes());
|
||||
vg.parse(false); // namespace unaware to all name space nodes addressable using xpath @*
|
||||
VTDNav vn = vg.getNav();
|
||||
XMLModifier xm = new XMLModifier(vn);
|
||||
|
||||
namespaceList = " ".concat(namespaceList).concat(" ");
|
||||
byte[] attrBytes = namespaceList.getBytes();
|
||||
|
||||
vn.toElement(VTDNav.ROOT);
|
||||
xm.insertAttribute(attrBytes);
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
xm.output(baos);
|
||||
return baos.toString();
|
||||
} catch(ParseException | ModifyException | NavException | IOException | TranscodeException e){
|
||||
log.error("Cannot add namespace declarations to element: "+xml);
|
||||
throw new CollectorServiceRuntimeException("Cannot add namespace declarations to element", e);
|
||||
}
|
||||
}
|
||||
|
||||
public Iterator<String> getIterator() {
|
||||
return iterator;
|
||||
}
|
||||
|
||||
public void setIterator(final Iterator<String> iterator) {
|
||||
this.iterator = iterator;
|
||||
}
|
||||
|
||||
public String getNamespaceList() {
|
||||
return namespaceList;
|
||||
}
|
||||
|
||||
public void setNamespaceList(String namespaceList) {
|
||||
this.namespaceList = namespaceList;
|
||||
}
|
||||
}
|
|
@ -43,4 +43,20 @@
|
|||
</bean>
|
||||
</property>
|
||||
</bean>
|
||||
<!-- Registers XMLsFolderCollectorPlugin under the protocol name "xmlsFolder" and declares its three interface parameters: splitOnElement, singleRemoteFile (optional BOOLEAN) and namespaceList. -->
<bean id="xmlsFolderCollectorPlugin" class="eu.dnetlib.data.collector.plugins.ariadneplus.XMLsFolderCollectorPlugin">
|
||||
<property name="protocolDescriptor">
|
||||
<bean class="eu.dnetlib.rmi.data.ProtocolDescriptor" p:name="xmlsFolder">
|
||||
<property name="params">
|
||||
<list>
|
||||
<bean class="eu.dnetlib.rmi.data.ProtocolParameter"
|
||||
p:name="splitOnElement"/>
|
||||
<bean class="eu.dnetlib.rmi.data.ProtocolParameter"
|
||||
p:name="singleRemoteFile" p:type="BOOLEAN" p:optional="true"/>
|
||||
<bean class="eu.dnetlib.rmi.data.ProtocolParameter"
|
||||
p:name="namespaceList"/>
|
||||
</list>
|
||||
</property>
|
||||
</bean>
|
||||
</property>
|
||||
</bean>
|
||||
</beans>
|
||||
|
|
Loading…
Reference in New Issue