package eu.dnetlib.data.collector.plugins.oaisets; import java.io.StringReader; import java.util.Iterator; import java.util.Queue; import java.util.Set; import java.util.concurrent.PriorityBlockingQueue; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.Node; import org.dom4j.io.SAXReader; import com.google.common.collect.Sets; import eu.dnetlib.data.collector.plugins.HttpConnector; import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner; import eu.dnetlib.data.collector.rmi.CollectorServiceException; public class OaiSetsIterator implements Iterator { private static final Log log = LogFactory.getLog(OaiSetsIterator.class); private Queue queue = new PriorityBlockingQueue(); private SAXReader reader = new SAXReader(); private String baseUrl; private String token; private boolean started; private HttpConnector httpConnector; private Set setsAlreadySeen = Sets.newHashSet(); public OaiSetsIterator(final String baseUrl, final HttpConnector httpConnector) { this.baseUrl = baseUrl; this.started = false; this.httpConnector = httpConnector; } private void verifyStarted() { if (!this.started) { this.started = true; try { this.token = firstPage(); } catch (CollectorServiceException e) { throw new RuntimeException(e); } } } @Override public boolean hasNext() { synchronized (queue) { verifyStarted(); return !queue.isEmpty(); } } @Override public String next() { synchronized (queue) { verifyStarted(); final String res = queue.poll(); while (queue.isEmpty() && (token != null) && !token.isEmpty()) { try { token = otherPages(token); } catch (CollectorServiceException e) { throw new RuntimeException(e); } } return res; } } @Override public void remove() {} private String firstPage() throws CollectorServiceException { final String url = baseUrl + "?verb=ListSets"; log.info("Start harvesting using url: " + url); return downloadPage(url); } private String otherPages(final String resumptionToken) throws CollectorServiceException { return downloadPage(baseUrl + "?verb=ListSets&resumptionToken=" + resumptionToken); } private String downloadPage(final String url) throws CollectorServiceException { final String xml = httpConnector.getInputSource(url); Document doc; try { doc = reader.read(new StringReader(xml)); } catch (DocumentException e) { log.warn("Error parsing xml, I try to clean it: " + xml, e); final String cleaned = XmlCleaner.cleanAllEntities(xml); try { doc = reader.read(new StringReader(cleaned)); } catch (DocumentException e1) { throw new CollectorServiceException("Error parsing cleaned document:" + cleaned, e1); } } final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']"); if (errorNode != null) { final String code = errorNode.valueOf("@code"); if ("noRecordsMatch".equalsIgnoreCase(code.trim())) { log.warn("noRecordsMatch for oai call: " + url); return null; } else throw new CollectorServiceException(code + " - " + errorNode.getText()); } boolean sawAllSets = true; for (Object o : doc.selectNodes("//*[local-name()='ListSets']/*[local-name()='set']")) { String set = ((Element) o).valueOf("./*[local-name()='setSpec']"); if (!setsAlreadySeen.contains(set)) { sawAllSets = false; setsAlreadySeen.add(set); queue.add(((Node) o).asXML()); } } if (sawAllSets) { log.warn("URL " + baseUrl + " keeps returning the same OAI sets. Please contact the repo admin."); System.out.println("URL " + baseUrl + " keeps returning the same OAI sets. Please contact the repo admin."); return null; } else return doc.valueOf("//*[local-name()='resumptionToken']"); } }