108 lines
2.7 KiB
Java
108 lines
2.7 KiB
Java
package eu.dnetlib.data.collector.plugins.ariadneplus.ehri;
|
|
|
|
import java.io.ByteArrayOutputStream;
|
|
import java.io.IOException;
|
|
import java.net.URL;
|
|
import java.util.Iterator;
|
|
|
|
import com.ximpleware.*;
|
|
import eu.dnetlib.data.collector.ThreadSafeIterator;
|
|
import eu.dnetlib.rmi.data.CollectorServiceRuntimeException;
|
|
import org.apache.commons.io.IOUtils;
|
|
import org.apache.commons.logging.Log;
|
|
import org.apache.commons.logging.LogFactory;
|
|
|
|
/**
|
|
* Created by Alessia Bardi on 03/05/2017.
|
|
*
|
|
* @author Alessia Bardi
|
|
*/
|
|
public class EHRIIterator extends ThreadSafeIterator {
|
|
|
|
private static final Log log = LogFactory.getLog(EHRIIterator.class);
|
|
private static int MAX_FAILED = 100;
|
|
private Iterator<String> identifiers;
|
|
private String baseUrl;
|
|
private String suffix;
|
|
private int failed = 0;
|
|
|
|
|
|
public EHRIIterator(final Iterator<String> idIterator, final String baseUrl, final String suffix){
|
|
this.identifiers = idIterator;
|
|
this.baseUrl = baseUrl;
|
|
this.suffix = suffix;
|
|
}
|
|
|
|
@Override
|
|
public boolean doHasNext() {
|
|
return identifiers.hasNext();
|
|
}
|
|
|
|
@Override
|
|
public String doNext() {
|
|
String target = baseUrl+"/"+identifiers.next()+"/"+suffix;
|
|
log.debug("Getting "+target);
|
|
try {
|
|
URL url = new URL(target);
|
|
String record = IOUtils.toString(url, "UTF-8");
|
|
return removeDefaultEADNamespace(record);
|
|
} catch (IOException e) {
|
|
log.error("Unable to get "+target, e);
|
|
failed++;
|
|
if(failed > MAX_FAILED){
|
|
throw new CollectorServiceRuntimeException("Could not download more than "+MAX_FAILED+"documents from EHRI. Stopping.");
|
|
}
|
|
if(this.hasNext()){
|
|
return this.next();
|
|
}
|
|
else return "";
|
|
}
|
|
}
|
|
|
|
protected String removeDefaultEADNamespace(final String xml) {
|
|
try {
|
|
VTDGen vg = new VTDGen();
|
|
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
|
vg.setDoc(xml.getBytes());
|
|
vg.parse(false); // turn off namespace awareness so that
|
|
VTDNav vn = vg.getNav();
|
|
AutoPilot ap = new AutoPilot(vn);
|
|
XMLModifier xm = new XMLModifier(vn);
|
|
ap.selectXPath("/ead/@xmlns");
|
|
int i = 0;
|
|
while ((i = ap.evalXPath()) != -1) {
|
|
xm.remove();
|
|
}
|
|
xm.output(baos);
|
|
return baos.toString();
|
|
}catch(Exception e){
|
|
log.error("Cannot remove default namespace from ead element: "+xml);
|
|
throw new CollectorServiceRuntimeException("Cannot remove default namespace from ead element", e);
|
|
}
|
|
}
|
|
|
|
public Iterator<String> getIdentifiers() {
|
|
return identifiers;
|
|
}
|
|
|
|
public void setIdentifiers(final Iterator<String> identifiers) {
|
|
this.identifiers = identifiers;
|
|
}
|
|
|
|
public String getBaseUrl() {
|
|
return baseUrl;
|
|
}
|
|
|
|
public void setBaseUrl(final String baseUrl) {
|
|
this.baseUrl = baseUrl;
|
|
}
|
|
|
|
public String getSuffix() {
|
|
return suffix;
|
|
}
|
|
|
|
public void setSuffix(final String suffix) {
|
|
this.suffix = suffix;
|
|
}
|
|
}
|