package eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle; import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable; import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryQueueIterator; import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.HttpApiRepositoryIterable; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.json.JSONArray; import org.json.JSONObject; import java.net.URL; import java.nio.charset.Charset; import java.util.Iterator; import java.util.NoSuchElementException; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; public class KaggleRepositoryIterable implements HttpApiRepositoryIterable { private static final Log log = LogFactory.getLog(KaggleRepositoryIterable.class); public static class Options { private String queryUrl; private String queryPagePlaceholder; private Charset charset; private String responsePropertyTotalDataset; private String responsePropertyDatasetList; private String responsePropertyDatasetUrl; private String responseBaseDatasetUrl; private long putTimeout; private TimeUnit putTimeoutUnit; private RepositoryQueueIterator.Options repositoryQueueIteratorOptions; private int queueSize; public long getPutTimeout() { return putTimeout; } public void setPutTimeout(long putTimeout) { this.putTimeout = putTimeout; } public TimeUnit getPutTimeoutUnit() { return putTimeoutUnit; } public void setPutTimeoutUnit(TimeUnit putTimeoutUnit) { this.putTimeoutUnit = putTimeoutUnit; } public int getQueueSize() { return queueSize; } public void setQueueSize(int queueSize) { this.queueSize = queueSize; } public String getResponseBaseDatasetUrl() { return responseBaseDatasetUrl; } public void setResponseBaseDatasetUrl(String responseBaseDatasetUrl) { this.responseBaseDatasetUrl = responseBaseDatasetUrl; } public RepositoryQueueIterator.Options getRepositoryQueueIteratorOptions() { return repositoryQueueIteratorOptions; } public void setRepositoryQueueIteratorOptions(RepositoryQueueIterator.Options repositoryQueueIteratorOptions) { this.repositoryQueueIteratorOptions = repositoryQueueIteratorOptions; } public String getResponsePropertyDatasetUrl() { return responsePropertyDatasetUrl; } public void setResponsePropertyDatasetUrl(String responsePropertyDatasetUrl) { this.responsePropertyDatasetUrl = responsePropertyDatasetUrl; } public String getResponsePropertyDatasetList() { return responsePropertyDatasetList; } public void setResponsePropertyDatasetList(String responsePropertyDatasetList) { this.responsePropertyDatasetList = responsePropertyDatasetList; } public String getResponsePropertyTotalDataset() { return responsePropertyTotalDataset; } public void setResponsePropertyTotalDataset(String responsePropertyTotalDataset) { this.responsePropertyTotalDataset = responsePropertyTotalDataset; } public Charset getCharset() { return charset; } public void setCharset(Charset charset) { this.charset = charset; } public String getQueryPagePlaceholder() { return queryPagePlaceholder; } public void setQueryPagePlaceholder(String queryPagePlaceholder) { this.queryPagePlaceholder = queryPagePlaceholder; } public String getQueryUrl() { return queryUrl; } public void setQueryUrl(String queryUrl) { this.queryUrl = queryUrl; } } private Options options; private ArrayBlockingQueue queue; public KaggleRepositoryIterable(Options options) { this.options = options; // this.currentPage = 1; // this.terminated = false; } public void bootstrap() { this.queue = new ArrayBlockingQueue<>(this.options.getQueueSize()); Thread ft = new Thread(new Harvester() ); ft.start(); // ExecutorService executor = Executors.newSingleThreadExecutor(); // executor.execute(new Harvester()); // executor.shutdown(); } @Override public Iterator iterator() { return new RepositoryQueueIterator(this.options.getRepositoryQueueIteratorOptions(), this.queue); } private class Harvester implements Runnable{ @Override public void run() { this.execute(); } private void execute() { try { int currentPage = 1; int totalDatasets = 0; int readDatasets = 0; while (true) { String query = options.getQueryUrl().replace(options.getQueryPagePlaceholder(), Integer.toString(currentPage)); String response = IOUtils.toString(new URL(query), options.getCharset()); currentPage += 1; JSONObject pageObject = new JSONObject(response); totalDatasets = pageObject.optInt(options.getResponsePropertyTotalDataset()); JSONArray datasets = pageObject.optJSONArray(options.getResponsePropertyDatasetList()); if (datasets == null || datasets.length() == 0) break; readDatasets += datasets.length(); for (int i = 0; i < datasets.length(); i += 1) { JSONObject item = datasets.optJSONObject(i); String urlFragment = item.optString(options.getResponsePropertyDatasetUrl()); if (urlFragment == null || urlFragment.trim().length() == 0) continue; String endpoint = String.format("%s%s", options.getResponseBaseDatasetUrl(), urlFragment); log.debug("adding endpoint in queue"); log.debug("queue size: " + queue.size()); try { queue.offer(endpoint, options.getPutTimeout(), options.getPutTimeoutUnit()); } catch (InterruptedException ex) { log.warn(String.format("could not put elements from queue for more than %s %s. breaking", options.getPutTimeout(), options.getPutTimeoutUnit())); break; } log.debug("endpoint added in queue"); log.debug("queue size: " + queue.size()); } if (readDatasets >= totalDatasets) break; } } catch (Exception ex) { log.error("problem execution harvesting", ex); } finally { try { queue.offer(RepositoryIterable.TerminationHint, options.getPutTimeout(), options.getPutTimeoutUnit()); } catch (Exception ex) { log.fatal("could not add termination hint. the process will not terminate gracefully", ex); } } } } }