// dnet-core/dnet-data-services/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/httpapi/kaggle/KaggleRepositoryIterable.java
//
// 209 lines
// 6.4 KiB
// Java

package eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle;
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryQueueIterator;
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.HttpApiRepositoryIterable;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONArray;
import org.json.JSONObject;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
 * Pages through a JSON HTTP API and exposes the dataset endpoints it finds as an
 * {@link Iterable} of URL strings.
 *
 * <p>{@link #bootstrap()} starts a background thread that downloads result pages,
 * builds one endpoint URL per listed dataset and offers it to a bounded queue.
 * {@link #iterator()} wraps that queue in a {@link RepositoryQueueIterator}. When
 * harvesting ends (normally or not) the harvester offers
 * {@link RepositoryIterable#TerminationHint} to the queue.
 *
 * <p>{@link #bootstrap()} must be called before {@link #iterator()}; otherwise the
 * iterator is constructed with a {@code null} queue.
 */
public class KaggleRepositoryIterable implements HttpApiRepositoryIterable {
    private static final Log log = LogFactory.getLog(KaggleRepositoryIterable.class);

    /** Mutable configuration bean read by the harvester. */
    public static class Options {
        /** Query URL template; every occurrence of the page placeholder is replaced by the page number. */
        private String queryUrl;
        /** Literal substring of {@link #queryUrl} that is replaced by the page number (pages start at 1). */
        private String queryPagePlaceholder;
        /** Charset used to decode the HTTP response body. */
        private Charset charset;
        /** Name of the response property read as the total number of datasets. */
        private String responsePropertyTotalDataset;
        /** Name of the response property holding the JSON array of datasets of the page. */
        private String responsePropertyDatasetList;
        /** Name of the property of each dataset entry that holds its URL fragment. */
        private String responsePropertyDatasetUrl;
        /** Prefix prepended to each dataset URL fragment to build the endpoint. */
        private String responseBaseDatasetUrl;
        /** Maximum time to wait for free space when offering an element to the queue. */
        private long putTimeout;
        /** Unit of {@link #putTimeout}. */
        private TimeUnit putTimeoutUnit;
        /** Options handed unchanged to the {@link RepositoryQueueIterator} created by {@code iterator()}. */
        private RepositoryQueueIterator.Options repositoryQueueIteratorOptions;
        /** Capacity of the bounded queue between harvester and iterator. */
        private int queueSize;

        public long getPutTimeout() {
            return putTimeout;
        }

        public void setPutTimeout(long putTimeout) {
            this.putTimeout = putTimeout;
        }

        public TimeUnit getPutTimeoutUnit() {
            return putTimeoutUnit;
        }

        public void setPutTimeoutUnit(TimeUnit putTimeoutUnit) {
            this.putTimeoutUnit = putTimeoutUnit;
        }

        public int getQueueSize() {
            return queueSize;
        }

        public void setQueueSize(int queueSize) {
            this.queueSize = queueSize;
        }

        public String getResponseBaseDatasetUrl() {
            return responseBaseDatasetUrl;
        }

        public void setResponseBaseDatasetUrl(String responseBaseDatasetUrl) {
            this.responseBaseDatasetUrl = responseBaseDatasetUrl;
        }

        public RepositoryQueueIterator.Options getRepositoryQueueIteratorOptions() {
            return repositoryQueueIteratorOptions;
        }

        public void setRepositoryQueueIteratorOptions(RepositoryQueueIterator.Options repositoryQueueIteratorOptions) {
            this.repositoryQueueIteratorOptions = repositoryQueueIteratorOptions;
        }

        public String getResponsePropertyDatasetUrl() {
            return responsePropertyDatasetUrl;
        }

        public void setResponsePropertyDatasetUrl(String responsePropertyDatasetUrl) {
            this.responsePropertyDatasetUrl = responsePropertyDatasetUrl;
        }

        public String getResponsePropertyDatasetList() {
            return responsePropertyDatasetList;
        }

        public void setResponsePropertyDatasetList(String responsePropertyDatasetList) {
            this.responsePropertyDatasetList = responsePropertyDatasetList;
        }

        public String getResponsePropertyTotalDataset() {
            return responsePropertyTotalDataset;
        }

        public void setResponsePropertyTotalDataset(String responsePropertyTotalDataset) {
            this.responsePropertyTotalDataset = responsePropertyTotalDataset;
        }

        public Charset getCharset() {
            return charset;
        }

        public void setCharset(Charset charset) {
            this.charset = charset;
        }

        public String getQueryPagePlaceholder() {
            return queryPagePlaceholder;
        }

        public void setQueryPagePlaceholder(String queryPagePlaceholder) {
            this.queryPagePlaceholder = queryPagePlaceholder;
        }

        public String getQueryUrl() {
            return queryUrl;
        }

        public void setQueryUrl(String queryUrl) {
            this.queryUrl = queryUrl;
        }
    }

    private final Options options;
    private ArrayBlockingQueue<String> queue;

    /**
     * @param options harvesting configuration; it is read lazily by the harvester thread,
     *                not copied
     */
    public KaggleRepositoryIterable(Options options) {
        this.options = options;
    }

    /**
     * Allocates the bounded queue (capacity {@link Options#getQueueSize()}) and starts the
     * harvester on a new thread.
     */
    public void bootstrap() {
        this.queue = new ArrayBlockingQueue<>(this.options.getQueueSize());
        Thread harvesterThread = new Thread(new Harvester(), "kaggle-harvester");
        harvesterThread.start();
    }

    /**
     * @return an iterator over the queue filled by the harvester, configured with
     *         {@link Options#getRepositoryQueueIteratorOptions()}
     */
    @Override
    public Iterator<String> iterator() {
        return new RepositoryQueueIterator(this.options.getRepositoryQueueIteratorOptions(), this.queue);
    }

    /** Background task that downloads result pages and feeds dataset endpoints into the queue. */
    private class Harvester implements Runnable {
        @Override
        public void run() {
            this.execute();
        }

        /**
         * Fetches pages starting from page 1 until a page has no datasets, the number of listed
         * datasets reaches the reported total, the queue rejects an endpoint, the thread is
         * interrupted, or an error occurs. The termination hint is always offered afterwards.
         */
        private void execute() {
            boolean interrupted = false;
            try {
                int currentPage = 1;
                int readDatasets = 0;
                while (true) {
                    String query = options.getQueryUrl().replace(options.getQueryPagePlaceholder(), Integer.toString(currentPage));
                    String response = IOUtils.toString(new URL(query), options.getCharset());
                    currentPage += 1;

                    JSONObject pageObject = new JSONObject(response);
                    // optInt yields 0 when the property is missing or not numeric, in which case
                    // the check at the bottom of the loop stops after this page.
                    int totalDatasets = pageObject.optInt(options.getResponsePropertyTotalDataset());
                    JSONArray datasets = pageObject.optJSONArray(options.getResponsePropertyDatasetList());
                    if (datasets == null || datasets.length() == 0) break;

                    readDatasets += datasets.length();
                    if (!enqueuePage(datasets)) break;
                    if (readDatasets >= totalDatasets) break;
                }
            } catch (InterruptedException ex) {
                // Remember the interruption but re-assert it only after the termination hint has
                // been offered: a set interrupt flag would make that timed offer fail immediately.
                interrupted = true;
                log.warn("interrupted while adding endpoint in queue. breaking", ex);
            } catch (Exception ex) {
                log.error("problem execution harvesting", ex);
            } finally {
                if (offerTerminationHint()) {
                    interrupted = true;
                }
                if (interrupted) {
                    Thread.currentThread().interrupt();
                }
            }
        }

        /**
         * Builds the endpoint of every dataset entry of one page and offers it to the queue.
         * Entries that are not JSON objects or that have a blank URL fragment are skipped.
         *
         * @param datasets dataset entries of the current page
         * @return {@code true} if every usable entry was queued, {@code false} if the queue
         *         stayed full for the configured timeout and harvesting should stop
         * @throws InterruptedException if the thread is interrupted while waiting for queue space
         */
        private boolean enqueuePage(JSONArray datasets) throws InterruptedException {
            for (int i = 0; i < datasets.length(); i += 1) {
                JSONObject item = datasets.optJSONObject(i);
                // optJSONObject returns null for entries that are not objects; skip them instead
                // of failing the whole harvest with a NullPointerException.
                if (item == null) continue;

                String urlFragment = item.optString(options.getResponsePropertyDatasetUrl());
                if (urlFragment == null || urlFragment.trim().length() == 0) continue;

                String endpoint = String.format("%s%s", options.getResponseBaseDatasetUrl(), urlFragment);
                log.debug("adding endpoint in queue");
                log.debug("queue size: " + queue.size());
                // A timed offer signals a timeout through its return value, not an exception.
                boolean accepted = queue.offer(endpoint, options.getPutTimeout(), options.getPutTimeoutUnit());
                if (!accepted) {
                    log.warn(String.format("could not put elements from queue for more than %s %s. breaking", options.getPutTimeout(), options.getPutTimeoutUnit()));
                    return false;
                }
                log.debug("endpoint added in queue");
                log.debug("queue size: " + queue.size());
            }
            return true;
        }

        /**
         * Offers {@link RepositoryIterable#TerminationHint} to the queue and logs when it could
         * not be added.
         *
         * @return {@code true} if the attempt was cut short by a thread interruption
         */
        private boolean offerTerminationHint() {
            try {
                boolean accepted = queue.offer(RepositoryIterable.TerminationHint, options.getPutTimeout(), options.getPutTimeoutUnit());
                if (!accepted) {
                    log.fatal("could not add termination hint. the process will not terminate gracefully");
                }
                return false;
            } catch (InterruptedException ex) {
                log.fatal("could not add termination hint. the process will not terminate gracefully", ex);
                return true;
            } catch (Exception ex) {
                log.fatal("could not add termination hint. the process will not terminate gracefully", ex);
                return false;
            }
        }
    }
}