AriadnePlus/dnet-ariadneplus/src/main/java/eu/dnetlib/data/collector/plugins/ariadneplus/ehri/EHRIIterator.java

108 lines
2.7 KiB
Java

package eu.dnetlib.data.collector.plugins.ariadneplus.ehri;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import com.ximpleware.*;
import eu.dnetlib.data.collector.ThreadSafeIterator;
import eu.dnetlib.rmi.data.CollectorServiceRuntimeException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Iterator that downloads one EHRI record per identifier.
 * <p>
 * For every identifier supplied by the wrapped iterator the record is fetched from
 * {@code baseUrl + "/" + identifier + "/" + suffix}, decoded as UTF-8, and returned after the
 * {@code xmlns} attribute of the root {@code ead} element has been removed.
 * <p>
 * A download that fails with an {@link IOException} is logged and skipped. Once more than
 * {@link #MAX_FAILED} downloads have failed, a {@link CollectorServiceRuntimeException} is thrown.
 * <p>
 * Created by Alessia Bardi on 03/05/2017.
 *
 * @author Alessia Bardi
 */
public class EHRIIterator extends ThreadSafeIterator {

    private static final Log log = LogFactory.getLog(EHRIIterator.class);

    /** Number of failed downloads tolerated by one iterator before the collection is aborted. */
    private static final int MAX_FAILED = 100;

    /** Source of the record identifiers to download. */
    private Iterator<String> identifiers;
    /** First part of every download URL. */
    private String baseUrl;
    /** Last path segment of every download URL. */
    private String suffix;
    /** Number of downloads that have failed so far on this iterator. */
    private int failed = 0;

    /**
     * Creates an iterator over the records identified by {@code idIterator}.
     *
     * @param idIterator iterator over the record identifiers
     * @param baseUrl    first part of the download URL
     * @param suffix     last path segment of the download URL
     */
    public EHRIIterator(final Iterator<String> idIterator, final String baseUrl, final String suffix) {
        this.identifiers = idIterator;
        this.baseUrl = baseUrl;
        this.suffix = suffix;
    }

    /**
     * @return {@code true} if the identifier iterator has at least one more identifier
     */
    @Override
    public boolean doHasNext() {
        return identifiers.hasNext();
    }

    /**
     * Downloads the record for the next identifier and strips the default namespace from it.
     * <p>
     * If the download fails with an {@link IOException}, the failure is logged and counted and
     * the following identifier is tried instead. If no identifier is left after a failure, the
     * empty string is returned.
     *
     * @return the downloaded record without the {@code xmlns} attribute on the root {@code ead}
     *         element, or {@code ""} when the last download failed and no identifier is left
     * @throws CollectorServiceRuntimeException if more than {@link #MAX_FAILED} downloads have
     *         failed, or if the namespace cannot be removed from a downloaded record
     */
    @Override
    public String doNext() {
        final String target = baseUrl + "/" + identifiers.next() + "/" + suffix;
        log.debug("Getting " + target);
        try {
            final URL url = new URL(target);
            final String record = IOUtils.toString(url, "UTF-8");
            return removeDefaultEADNamespace(record);
        } catch (IOException e) {
            log.error("Unable to get " + target, e);
            failed++;
            if (failed > MAX_FAILED) {
                // Keep the last I/O failure as the cause so the reason for aborting is not lost.
                throw new CollectorServiceRuntimeException(
                        "Could not download more than " + MAX_FAILED + " documents from EHRI. Stopping.", e);
            }
            // Skip the failed record: continue with the following identifier through the
            // inherited public iterator methods, exactly as before.
            if (this.hasNext()) {
                return this.next();
            }
            return "";
        }
    }

    /**
     * Removes the {@code xmlns} attribute from the root {@code ead} element of the given XML.
     * <p>
     * The document is handed to the parser as UTF-8 bytes and the modified document is decoded
     * as UTF-8 again, so the result does not depend on the platform default charset.
     *
     * @param xml the XML record to process
     * @return the XML with every {@code /ead/@xmlns} match removed
     * @throws CollectorServiceRuntimeException if the XML cannot be parsed or modified
     */
    protected String removeDefaultEADNamespace(final String xml) {
        try {
            final VTDGen vg = new VTDGen();
            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            // Explicit charset: the record was fetched as UTF-8, so it must be re-encoded as UTF-8.
            vg.setDoc(xml.getBytes(StandardCharsets.UTF_8));
            // Turn off namespace awareness so that, presumably, the xmlns declaration is exposed
            // as an ordinary attribute and can be selected by the XPath below.
            vg.parse(false);
            final VTDNav vn = vg.getNav();
            final AutoPilot ap = new AutoPilot(vn);
            final XMLModifier xm = new XMLModifier(vn);
            ap.selectXPath("/ead/@xmlns");
            // Each successful evaluation is followed by a removal at the current position.
            while (ap.evalXPath() != -1) {
                xm.remove();
            }
            xm.output(baos);
            return new String(baos.toByteArray(), StandardCharsets.UTF_8);
        } catch (Exception e) {
            log.error("Cannot remove default namespace from ead element: " + xml);
            throw new CollectorServiceRuntimeException("Cannot remove default namespace from ead element", e);
        }
    }

    /** @return the iterator supplying the record identifiers */
    public Iterator<String> getIdentifiers() {
        return identifiers;
    }

    /** @param identifiers the iterator supplying the record identifiers */
    public void setIdentifiers(final Iterator<String> identifiers) {
        this.identifiers = identifiers;
    }

    /** @return the first part of the download URL */
    public String getBaseUrl() {
        return baseUrl;
    }

    /** @param baseUrl the first part of the download URL */
    public void setBaseUrl(final String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /** @return the last path segment of the download URL */
    public String getSuffix() {
        return suffix;
    }

    /** @param suffix the last path segment of the download URL */
    public void setSuffix(final String suffix) {
        this.suffix = suffix;
    }
}