dnet-core/dnet-data-services/src/main/java/eu/dnetlib/data/collector/plugins/projects/grist/GristProjectsIterable.java

137 lines
4.0 KiB
Java

package eu.dnetlib.data.collector.plugins.projects.grist;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayDeque;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
import eu.dnetlib.enabling.resultset.SizedIterable;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
/**
 * Iterable over the project records served by a paged XML endpoint.
 *
 * <p>On construction the base URL is fetched once and the value of
 * {@code /Response/HitCount} is kept as the advertised number of elements. Each iterator
 * then requests {@code baseURL + "&page=N"} for N = 1, 2, ... and yields the XML
 * serialization of every {@code //RecordList/Record} node, in document order, until a page
 * contains no records.
 *
 * <p>Because pages are addressed by appending {@code "&page=N"}, the base URL is expected to
 * already carry a query string.
 *
 * <p>NOTE(review): the {@link SAXReader} instances are created with default settings and are
 * fed remote content; confirm that DTD / external-entity processing is acceptable here.
 */
public class GristProjectsIterable implements SizedIterable<String> {

    private static final Log log = LogFactory.getLog(GristProjectsIterable.class); // NOPMD by marko on 11/24/08 5:02 PM

    /** XPath of the element holding the total number of hits. */
    private static final String HIT_COUNT_XPATH = "/Response/HitCount";

    /** XPath selecting the project records of one result page. */
    private static final String RECORDS_XPATH = "//RecordList/Record";

    /** Value reported by {@link #getNumberOfElements()} when the hit count cannot be determined. */
    private static final int UNKNOWN_TOTAL = -1;

    private final String queryURL;
    private final int total;
    private final SAXReader reader;

    /**
     * Creates the iterable and immediately queries {@code baseURL} for the hit count.
     *
     * @param baseURL the query URL; page requests are built by appending {@code "&page=N"} to it
     * @throws CollectorServiceException if the URL is malformed, cannot be read, or does not
     *         return parseable XML
     */
    public GristProjectsIterable(String baseURL) throws CollectorServiceException {
        queryURL = baseURL;
        reader = new SAXReader();
        total = getTotalCount();
    }

    /**
     * @return the hit count read at construction time, or {@code -1} when the response had no
     *         usable {@code /Response/HitCount} value
     */
    @Override
    public int getNumberOfElements() {
        return total;
    }

    /**
     * Parses the XML document served at {@code url}.
     *
     * <p>The parser is handed the raw byte stream, so the document's encoding is determined by
     * the XML parser rather than by the JVM default charset. The stream is always closed.
     */
    private static Document parseFrom(SAXReader saxReader, URL url) throws IOException, DocumentException {
        InputStream in = url.openStream();
        try {
            return saxReader.read(in);
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Fetches the base URL and extracts the hit count.
     *
     * @return the parsed hit count, or {@code -1} if the element is missing or not an integer
     * @throws CollectorServiceException on a malformed URL, an I/O failure or unparseable XML
     */
    private int getTotalCount() throws CollectorServiceException {
        try {
            URL pageUrl = new URL(queryURL);
            log.debug("Getting hit count from: " + pageUrl.toString());
            Document doc = parseFrom(reader, pageUrl);
            Node hitCountNode = doc.selectSingleNode(HIT_COUNT_XPATH);
            if (hitCountNode == null) {
                // A response without the element is treated like a non-numeric one.
                log.warn("Cannot set the total count from '/Response/HitCount'");
                return UNKNOWN_TOTAL;
            }
            return Integer.parseInt(hitCountNode.getText().trim());
        } catch (NumberFormatException e) {
            log.warn("Cannot set the total count from '/Response/HitCount'");
            return UNKNOWN_TOTAL;
        } catch (DocumentException e) {
            throw new CollectorServiceException(e);
        } catch (IOException e) {
            // Also covers MalformedURLException, which is an IOException.
            throw new CollectorServiceException(e);
        }
    }

    /**
     * Returns a fresh iterator that starts again from page 1. Pages are fetched lazily, one at
     * a time, whenever the buffered records of the previous page have been consumed.
     *
     * <p>Any failure while fetching or parsing a page is reported from {@code hasNext()} /
     * {@code next()} as a {@link CollectorServiceRuntimeException}. {@code remove()} is not
     * supported.
     */
    @Override
    public Iterator<String> iterator() {
        return new Iterator<String>() {

            /** Records of the current page, kept in the order they appear in the document. */
            private final Queue<String> projects = new ArrayDeque<String>();
            /** Cleared once a page with no records has been seen. */
            private boolean morePages = true;
            /** Number of the last page requested; the first request asks for page 1. */
            private int pageNumber = 0;
            /** This iterator parses with its own reader rather than the enclosing instance's. */
            private final SAXReader pageReader = new SAXReader();
            //The following is for debug only
            private int nextCounter = 0;

            @Override
            public boolean hasNext() {
                fillProjectListIfNeeded();
                return !projects.isEmpty();
            }

            @Override
            public String next() {
                nextCounter++;
                log.debug(String.format("Calling next %s times. projects queue has %s elements", nextCounter, projects.size()));
                fillProjectListIfNeeded();
                String project = projects.poll();
                if (project == null) {
                    // Honour the Iterator contract instead of handing out null.
                    throw new NoSuchElementException("No more projects available from " + queryURL);
                }
                return project;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }

            /**
             * Loads the next page into the buffer when the buffer is empty and the end has
             * not been reached yet; otherwise does nothing.
             */
            private void fillProjectListIfNeeded() {
                if (!morePages || !projects.isEmpty()) {
                    return;
                }
                Document doc = getNextPage();
                List<?> records = doc.selectNodes(RECORDS_XPATH);
                if (records == null || records.isEmpty()) {
                    log.info("No more projects to read at page nr. " + pageNumber);
                    morePages = false;
                    return;
                }
                for (Object record : records) {
                    projects.add(((Node) record).asXML());
                }
            }

            /**
             * Advances the page counter and downloads and parses that page.
             *
             * @throws CollectorServiceRuntimeException on any failure, tagged with the page number
             */
            private Document getNextPage() {
                pageNumber++;
                try {
                    URL pageUrl = new URL(queryURL + "&page=" + pageNumber);
                    log.debug("Getting page at: " + pageUrl.toString());
                    return parseFrom(pageReader, pageUrl);
                } catch (Exception e) {
                    throw new CollectorServiceRuntimeException("Error on page " + pageNumber, e);
                }
            }
        };
    }
}