dnet-core/dnet-data-services/src/main/java/eu/dnetlib/data/collector/plugins/schemaorg/EndpointAccessIterator.java

107 lines
3.1 KiB
Java

package eu.dnetlib.data.collector.plugins.schemaorg;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Iterator;
public class EndpointAccessIterator implements Iterator<JSONObject> {
private static final Log log = LogFactory.getLog(EndpointAccessIterator.class);
public static class Options {
private Charset charset;
public Options(){}
public Options(Charset charset) {
this.charset = charset;
}
public Charset getCharset() {
return charset;
}
public void setCharset(Charset charset) {
this.charset = charset;
}
}
private Options options;
private Iterator<String> repositoryIterator;
public EndpointAccessIterator(Options options, Iterator<String> repositoryIterator) {
this.options = options;
this.repositoryIterator = repositoryIterator;
}
@Override
public boolean hasNext() {
return this.repositoryIterator.hasNext();
}
@Override
public JSONObject next() {
String endpoint = this.repositoryIterator.next();
if(endpoint == null) return null;
log.debug(String.format("processing: %s", endpoint));
JSONObject dataset = this.extractDatasetRecord(endpoint);
return dataset;
}
private JSONObject extractDatasetRecord(String endpoint) {
JSONObject datasetDocument = null;
try {
URL urlEndpoint = new URL(endpoint);
log.debug("downloading endpoint "+urlEndpoint);
String payload = Utils.RemoteAccessWithRetry(3, 5000, urlEndpoint, this.options.getCharset());
log.trace("downloaded payload id: "+payload);
Document doc = Jsoup.parse(payload);
Elements scriptTags = doc.getElementsByTag("script");
for (Element scriptTag : scriptTags) {
if (!scriptTag.hasAttr("type")) continue;
String scriptType = scriptTag.attr("type");
if (!scriptType.equalsIgnoreCase("application/ld+json")) continue;
String data = scriptTag.data();
JSONObject schemaItem = new JSONObject(data);
String context = schemaItem.optString("@context");
String type = schemaItem.optString("@type");
if (context == null || type == null) continue;
Boolean isSchemaOrgContext = context.toLowerCase().startsWith("http://schema.org") || context.toLowerCase().startsWith("https://schema.org");
Boolean isDataset = type.equalsIgnoreCase("dataset");
if (!isSchemaOrgContext || !isDataset) continue;
log.debug(String.format("discovered dataset document: %s", schemaItem.toString()));
datasetDocument = schemaItem;
break;
}
}catch(Exception ex){
log.error("problem extracting dataset document. returning empty", ex);
datasetDocument = null;
}
if(datasetDocument == null){
log.debug("did not find any dataset document in endpoint");
}
else{
log.debug("found dataset document in endpoint :"+datasetDocument.toString());
}
return datasetDocument;
}
}