package eu.dnetlib.data.collector.plugins.datacite;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Queue;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.data.collector.plugins.datacite.schema.DataciteSchema;
import eu.dnetlib.data.collector.plugins.datacite.schema.Result;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Iterator over the records exposed by a paged HTTP "scan" service.
 *
 * <p>The first page is requested from {@code <baseURL>/new_scan} (optionally with a
 * {@code timestamp} query parameter) when the iterator is constructed. Every page is a JSON
 * document mapped onto {@link DataciteSchema}; it carries a scroll id, used to request the
 * following page from {@code <baseURL>/scan/<scrollId>}, and a list of {@link Result}s whose
 * bodies are Base64-encoded, zlib-compressed text.
 *
 * <p>Records are buffered one page at a time; the next page is fetched as soon as the last
 * buffered record has been handed out. No synchronization is performed.
 */
public class DataciteESIterator implements Iterator<String> {

	private static final String START_PATH = "new_scan";

	private static final String NEXT_PATH = "scan/%s";

	private final long timestamp;

	private String scrollId;

	/** Decoded records of the current page that have not been returned yet. */
	private Queue<String> currentPage;

	private final Gson g = new GsonBuilder().create();

	// Always overwritten by the constructor; kept only as the historical default.
	private String baseURL = "http://ip-90-147-167-25.ct1.garrservices.it:5000";

	/**
	 * Creates the iterator and immediately loads the first page.
	 *
	 * @param timestamp
	 *            when greater than zero it is sent as the {@code timestamp} query parameter of the
	 *            initial request; otherwise no parameter is sent
	 * @param baseUrl
	 *            base URL of the scan service, without trailing slash
	 * @throws Exception
	 *             if the initial request cannot be built or performed
	 */
	public DataciteESIterator(long timestamp, String baseUrl) throws Exception {
		this.timestamp = timestamp;
		this.baseURL = baseUrl;
		currentPage = new ArrayDeque<>();
		startRequest();
	}

	/**
	 * Decodes a single result: Base64-decodes its body and inflates it.
	 *
	 * @param r
	 *            the result to decode
	 * @return the decoded text, or {@code null} when the result has no body or the body is
	 *         malformed or truncated
	 */
	private static String decompression(final Result r) {
		if (r == null || r.getBody() == null) {
			return null;
		}
		final byte[] compressed = Base64.decodeBase64(r.getBody().getBytes(StandardCharsets.UTF_8));
		final Inflater inflater = new Inflater();
		try {
			inflater.setInput(compressed);
			final ByteArrayOutputStream bos = new ByteArrayOutputStream(compressed.length);
			final byte[] buffer = new byte[8192];
			while (!inflater.finished()) {
				final int size = inflater.inflate(buffer);
				if (size == 0 && (inflater.needsInput() || inflater.needsDictionary())) {
					// Truncated stream (or one requiring a preset dictionary): no further progress
					// is possible, so bail out instead of looping forever.
					return null;
				}
				bos.write(buffer, 0, size);
			}
			return new String(bos.toByteArray(), StandardCharsets.UTF_8);
		} catch (DataFormatException e) {
			return null;
		} finally {
			// Release the native zlib state on every path.
			inflater.end();
		}
	}

	/**
	 * Parses a page and appends its successfully decoded records to the buffer, remembering the
	 * scroll id for the following request. Blank pages and the empty JSON array are ignored.
	 *
	 * <p>Parsing is best effort: on failure the page is dumped and skipped, which ends the
	 * iteration once the buffer is drained.
	 *
	 * @param hits
	 *            raw JSON of the page
	 */
	private void fillQueue(final String hits) {
		if (StringUtils.isBlank(hits) || "[]".equalsIgnoreCase(hits.trim())) {
			return;
		}
		try {
			final DataciteSchema datacitepage = g.fromJson(hits, DataciteSchema.class);
			this.scrollId = datacitepage.getScrollId();
			datacitepage.getResult().stream()
					.map(DataciteESIterator::decompression)
					.filter(Objects::nonNull)
					.forEach(this.currentPage::add);
		} catch (RuntimeException e) {
			// NOTE(review): reporting via stdout/stack trace is kept as-is; consider the project
			// logger. Only runtime failures are handled here so JVM errors are not swallowed.
			System.out.println(hits);
			e.printStackTrace();
		}
	}

	/** Reads the whole response of {@code url} as UTF-8 text, closing the stream afterwards. */
	private static String fetch(final URL url) throws IOException {
		try (InputStream in = url.openStream()) {
			return IOUtils.toString(in, StandardCharsets.UTF_8.name());
		}
	}

	/** Performs the initial scan request and buffers the first page. */
	private void startRequest() throws IOException {
		final String url = baseURL + "/" + START_PATH;
		final URL startUrl = new URL(timestamp > 0 ? url + "?timestamp=" + timestamp : url);
		fillQueue(fetch(startUrl));
	}

	/** Requests the page following the current scroll id and buffers it. */
	private void getNextPage() throws IOException {
		// Only the path template is formatted, so a '%' in the base URL cannot be misread as a
		// format specifier.
		final URL nextUrl = new URL(baseURL + "/" + String.format(NEXT_PATH, scrollId));
		fillQueue(fetch(nextUrl));
	}

	/** @return {@code true} while at least one buffered record is available */
	@Override
	public boolean hasNext() {
		return !currentPage.isEmpty();
	}

	/**
	 * Returns the next record, fetching the following page when the buffer becomes empty.
	 *
	 * @return the next decoded record
	 * @throws NoSuchElementException
	 *             if no record is left
	 * @throws UncheckedIOException
	 *             if the following page cannot be retrieved
	 */
	@Override
	public String next() {
		if (currentPage.isEmpty()) {
			throw new NoSuchElementException("No more records available");
		}
		final String nextItem = currentPage.remove();
		if (currentPage.isEmpty()) {
			try {
				getNextPage();
			} catch (IOException e) {
				throw new UncheckedIOException("Unable to fetch next page for scroll id " + scrollId, e);
			}
		}
		return nextItem;
	}

	public String getBaseURL() {
		return baseURL;
	}

	public void setBaseURL(final String baseURL) {
		this.baseURL = baseURL;
	}
}