package eu.dnetlib.data.collector.plugins.ariadneplus.ehri; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.net.URL; import java.util.Iterator; import com.ximpleware.*; import eu.dnetlib.data.collector.ThreadSafeIterator; import eu.dnetlib.rmi.data.CollectorServiceRuntimeException; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * Created by Alessia Bardi on 03/05/2017. * * @author Alessia Bardi */ public class EHRIIterator extends ThreadSafeIterator { private static final Log log = LogFactory.getLog(EHRIIterator.class); private static int MAX_FAILED = 100; private Iterator identifiers; private String baseUrl; private String suffix; private int failed = 0; public EHRIIterator(final Iterator idIterator, final String baseUrl, final String suffix){ this.identifiers = idIterator; this.baseUrl = baseUrl; this.suffix = suffix; } @Override public boolean doHasNext() { return identifiers.hasNext(); } @Override public String doNext() { String target = baseUrl+"/"+identifiers.next()+"/"+suffix; log.debug("Getting "+target); try { URL url = new URL(target); String record = IOUtils.toString(url, "UTF-8"); return removeDefaultEADNamespace(record); } catch (IOException e) { log.error("Unable to get "+target, e); failed++; if(failed > MAX_FAILED){ throw new CollectorServiceRuntimeException("Could not download more than "+MAX_FAILED+"documents from EHRI. Stopping."); } if(this.hasNext()){ return this.next(); } else return ""; } } protected String removeDefaultEADNamespace(final String xml) { try { VTDGen vg = new VTDGen(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); vg.setDoc(xml.getBytes()); vg.parse(false); // turn off namespace awareness so that VTDNav vn = vg.getNav(); AutoPilot ap = new AutoPilot(vn); XMLModifier xm = new XMLModifier(vn); ap.selectXPath("/ead/@xmlns"); int i = 0; while ((i = ap.evalXPath()) != -1) { xm.remove(); } xm.output(baos); return baos.toString(); }catch(Exception e){ log.error("Cannot remove default namespace from ead element: "+xml); throw new CollectorServiceRuntimeException("Cannot remove default namespace from ead element", e); } } public Iterator getIdentifiers() { return identifiers; } public void setIdentifiers(final Iterator identifiers) { this.identifiers = identifiers; } public String getBaseUrl() { return baseUrl; } public void setBaseUrl(final String baseUrl) { this.baseUrl = baseUrl; } public String getSuffix() { return suffix; } public void setSuffix(final String suffix) { this.suffix = suffix; } }