package eu.dnetlib.data.collector.plugins.schemaorg; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.json.JSONObject; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.net.URL; import java.nio.charset.Charset; import java.util.Iterator; public class EndpointAccessIterator implements Iterator { private static final Log log = LogFactory.getLog(EndpointAccessIterator.class); public static class Options { private Charset charset; public Options(){} public Options(Charset charset) { this.charset = charset; } public Charset getCharset() { return charset; } public void setCharset(Charset charset) { this.charset = charset; } } private Options options; private Iterator repositoryIterator; public EndpointAccessIterator(Options options, Iterator repositoryIterator) { this.options = options; this.repositoryIterator = repositoryIterator; } @Override public boolean hasNext() { return this.repositoryIterator.hasNext(); } @Override public JSONObject next() { String endpoint = this.repositoryIterator.next(); if(endpoint == null) return null; log.debug(String.format("processing: %s", endpoint)); JSONObject dataset = this.extractDatasetRecord(endpoint); return dataset; } private JSONObject extractDatasetRecord(String endpoint) { JSONObject datasetDocument = null; try { URL urlEndpoint = new URL(endpoint); log.debug("downloading endpoint "+urlEndpoint); String payload = Utils.RemoteAccessWithRetry(3, 5000, urlEndpoint, this.options.getCharset()); log.trace("downloaded payload id: "+payload); Document doc = Jsoup.parse(payload); Elements scriptTags = doc.getElementsByTag("script"); for (Element scriptTag : scriptTags) { if (!scriptTag.hasAttr("type")) continue; String scriptType = scriptTag.attr("type"); if (!scriptType.equalsIgnoreCase("application/ld+json")) continue; String data = scriptTag.data(); JSONObject schemaItem = new JSONObject(data); String context = schemaItem.optString("@context"); String type = schemaItem.optString("@type"); if (context == null || type == null) continue; Boolean isSchemaOrgContext = context.toLowerCase().startsWith("http://schema.org") || context.toLowerCase().startsWith("https://schema.org"); Boolean isDataset = type.equalsIgnoreCase("dataset"); if (!isSchemaOrgContext || !isDataset) continue; log.debug(String.format("discovered dataset document: %s", schemaItem.toString())); datasetDocument = schemaItem; break; } }catch(Exception ex){ log.error("problem extracting dataset document. returning empty", ex); datasetDocument = null; } if(datasetDocument == null){ log.debug("did not find any dataset document in endpoint"); } else{ log.debug("found dataset document in endpoint :"+datasetDocument.toString()); } return datasetDocument; } }