209 lines
6.4 KiB
Java
209 lines
6.4 KiB
Java
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle;
|
|
|
|
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
|
|
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryQueueIterator;
|
|
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.HttpApiRepositoryIterable;
|
|
import org.apache.commons.io.IOUtils;
|
|
import org.apache.commons.logging.Log;
|
|
import org.apache.commons.logging.LogFactory;
|
|
import org.json.JSONArray;
|
|
import org.json.JSONObject;
|
|
|
|
import java.net.URL;
|
|
import java.nio.charset.Charset;
|
|
import java.util.Iterator;
|
|
import java.util.NoSuchElementException;
|
|
import java.util.concurrent.ArrayBlockingQueue;
|
|
import java.util.concurrent.ExecutorService;
|
|
import java.util.concurrent.Executors;
|
|
import java.util.concurrent.TimeUnit;
|
|
|
|
public class KaggleRepositoryIterable implements HttpApiRepositoryIterable {
|
|
private static final Log log = LogFactory.getLog(KaggleRepositoryIterable.class);
|
|
|
|
public static class Options {
|
|
private String queryUrl;
|
|
private String queryPagePlaceholder;
|
|
private Charset charset;
|
|
private String responsePropertyTotalDataset;
|
|
private String responsePropertyDatasetList;
|
|
private String responsePropertyDatasetUrl;
|
|
private String responseBaseDatasetUrl;
|
|
private long putTimeout;
|
|
private TimeUnit putTimeoutUnit;
|
|
|
|
private RepositoryQueueIterator.Options repositoryQueueIteratorOptions;
|
|
|
|
private int queueSize;
|
|
|
|
public long getPutTimeout() {
|
|
return putTimeout;
|
|
}
|
|
|
|
public void setPutTimeout(long putTimeout) {
|
|
this.putTimeout = putTimeout;
|
|
}
|
|
|
|
public TimeUnit getPutTimeoutUnit() {
|
|
return putTimeoutUnit;
|
|
}
|
|
|
|
public void setPutTimeoutUnit(TimeUnit putTimeoutUnit) {
|
|
this.putTimeoutUnit = putTimeoutUnit;
|
|
}
|
|
|
|
public int getQueueSize() {
|
|
return queueSize;
|
|
}
|
|
|
|
public void setQueueSize(int queueSize) {
|
|
this.queueSize = queueSize;
|
|
}
|
|
|
|
public String getResponseBaseDatasetUrl() {
|
|
return responseBaseDatasetUrl;
|
|
}
|
|
|
|
public void setResponseBaseDatasetUrl(String responseBaseDatasetUrl) {
|
|
this.responseBaseDatasetUrl = responseBaseDatasetUrl;
|
|
}
|
|
|
|
public RepositoryQueueIterator.Options getRepositoryQueueIteratorOptions() {
|
|
return repositoryQueueIteratorOptions;
|
|
}
|
|
|
|
public void setRepositoryQueueIteratorOptions(RepositoryQueueIterator.Options repositoryQueueIteratorOptions) {
|
|
this.repositoryQueueIteratorOptions = repositoryQueueIteratorOptions;
|
|
}
|
|
|
|
public String getResponsePropertyDatasetUrl() {
|
|
return responsePropertyDatasetUrl;
|
|
}
|
|
|
|
public void setResponsePropertyDatasetUrl(String responsePropertyDatasetUrl) {
|
|
this.responsePropertyDatasetUrl = responsePropertyDatasetUrl;
|
|
}
|
|
|
|
public String getResponsePropertyDatasetList() {
|
|
return responsePropertyDatasetList;
|
|
}
|
|
|
|
public void setResponsePropertyDatasetList(String responsePropertyDatasetList) {
|
|
this.responsePropertyDatasetList = responsePropertyDatasetList;
|
|
}
|
|
|
|
public String getResponsePropertyTotalDataset() {
|
|
return responsePropertyTotalDataset;
|
|
}
|
|
|
|
public void setResponsePropertyTotalDataset(String responsePropertyTotalDataset) {
|
|
this.responsePropertyTotalDataset = responsePropertyTotalDataset;
|
|
}
|
|
|
|
public Charset getCharset() {
|
|
return charset;
|
|
}
|
|
|
|
public void setCharset(Charset charset) {
|
|
this.charset = charset;
|
|
}
|
|
|
|
public String getQueryPagePlaceholder() {
|
|
return queryPagePlaceholder;
|
|
}
|
|
|
|
public void setQueryPagePlaceholder(String queryPagePlaceholder) {
|
|
this.queryPagePlaceholder = queryPagePlaceholder;
|
|
}
|
|
|
|
public String getQueryUrl() {
|
|
return queryUrl;
|
|
}
|
|
|
|
public void setQueryUrl(String queryUrl) {
|
|
this.queryUrl = queryUrl;
|
|
}
|
|
}
|
|
|
|
private Options options;
|
|
private ArrayBlockingQueue<String> queue;
|
|
|
|
public KaggleRepositoryIterable(Options options) {
|
|
this.options = options;
|
|
// this.currentPage = 1;
|
|
// this.terminated = false;
|
|
}
|
|
|
|
public void bootstrap() {
|
|
this.queue = new ArrayBlockingQueue<>(this.options.getQueueSize());
|
|
|
|
Thread ft = new Thread(new Harvester() );
|
|
ft.start();
|
|
// ExecutorService executor = Executors.newSingleThreadExecutor();
|
|
// executor.execute(new Harvester());
|
|
// executor.shutdown();
|
|
}
|
|
|
|
@Override
|
|
public Iterator<String> iterator() {
|
|
return new RepositoryQueueIterator(this.options.getRepositoryQueueIteratorOptions(), this.queue);
|
|
}
|
|
|
|
private class Harvester implements Runnable{
|
|
|
|
@Override
|
|
public void run() {
|
|
this.execute();
|
|
}
|
|
private void execute() {
|
|
try {
|
|
int currentPage = 1;
|
|
int totalDatasets = 0;
|
|
int readDatasets = 0;
|
|
while (true) {
|
|
String query = options.getQueryUrl().replace(options.getQueryPagePlaceholder(), Integer.toString(currentPage));
|
|
String response = IOUtils.toString(new URL(query), options.getCharset());
|
|
currentPage += 1;
|
|
|
|
JSONObject pageObject = new JSONObject(response);
|
|
totalDatasets = pageObject.optInt(options.getResponsePropertyTotalDataset());
|
|
JSONArray datasets = pageObject.optJSONArray(options.getResponsePropertyDatasetList());
|
|
|
|
if (datasets == null || datasets.length() == 0) break;
|
|
|
|
readDatasets += datasets.length();
|
|
|
|
for (int i = 0; i < datasets.length(); i += 1) {
|
|
JSONObject item = datasets.optJSONObject(i);
|
|
String urlFragment = item.optString(options.getResponsePropertyDatasetUrl());
|
|
if (urlFragment == null || urlFragment.trim().length() == 0) continue;
|
|
String endpoint = String.format("%s%s", options.getResponseBaseDatasetUrl(), urlFragment);
|
|
|
|
log.debug("adding endpoint in queue");
|
|
log.debug("queue size: " + queue.size());
|
|
|
|
try {
|
|
queue.offer(endpoint, options.getPutTimeout(), options.getPutTimeoutUnit());
|
|
} catch (InterruptedException ex) {
|
|
log.warn(String.format("could not put elements from queue for more than %s %s. breaking", options.getPutTimeout(), options.getPutTimeoutUnit()));
|
|
break;
|
|
}
|
|
log.debug("endpoint added in queue");
|
|
log.debug("queue size: " + queue.size());
|
|
}
|
|
|
|
if (readDatasets >= totalDatasets) break;
|
|
}
|
|
} catch (Exception ex) {
|
|
log.error("problem execution harvesting", ex);
|
|
} finally {
|
|
try {
|
|
queue.offer(RepositoryIterable.TerminationHint, options.getPutTimeout(), options.getPutTimeoutUnit());
|
|
} catch (Exception ex) {
|
|
log.fatal("could not add termination hint. the process will not terminate gracefully", ex);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|