imported dnet-modular-collector-service-rmi in dnet-core-components, imported dnet-modular-collector-service (and plugins) in dnet-data-services

This commit is contained in:
Claudio Atzori 2019-06-07 14:13:59 +02:00
parent 1c192fbfee
commit 7acac5986a
147 changed files with 29511 additions and 2 deletions

View File

@ -0,0 +1,12 @@
package eu.dnetlib.data.collector.functions;
import java.util.List;
import java.util.Map;
import eu.dnetlib.data.collector.rmi.ProtocolParameterValue;
/**
 * Strategy that produces the list of values a protocol parameter may take.
 * An instance can be attached to a {@code ProtocolParameter} as its populate function.
 */
public interface ParamValuesFunction {
/**
 * Looks up the candidate values for a parameter.
 *
 * @param baseUrl the base URL supplied by the caller
 * @param params  the other parameters supplied by the caller
 * @return the candidate values
 */
List<ProtocolParameterValue> findValues(String baseUrl, Map<String, String> params);
}

View File

@ -0,0 +1,43 @@
package eu.dnetlib.data.collector.plugin;
import java.util.List;
import org.springframework.beans.factory.annotation.Required;
import com.google.common.base.Function;
import com.google.common.collect.Lists;
import eu.dnetlib.data.collector.plugin.CollectorPlugin;
import eu.dnetlib.data.collector.rmi.ProtocolDescriptor;
import eu.dnetlib.data.collector.rmi.ProtocolParameter;
/**
 * Base class for collector plugins: the protocol name and the list of accepted
 * parameter names are both derived from an injected {@link ProtocolDescriptor}.
 */
public abstract class AbstractCollectorPlugin implements CollectorPlugin {

    /** Descriptor supplied through {@link #setProtocolDescriptor(ProtocolDescriptor)}. */
    private ProtocolDescriptor protocolDescriptor;

    /**
     * @return the name stored in the protocol descriptor
     */
    @Override
    public final String getProtocol() {
        final ProtocolDescriptor descriptor = getProtocolDescriptor();
        return descriptor.getName();
    }

    /**
     * @return a new list holding the name of every parameter of the protocol descriptor, in descriptor order
     */
    @Override
    public final List<String> listNameParameters() {
        final List<String> names = Lists.newArrayList();
        for (final ProtocolParameter parameter : getProtocolDescriptor().getParams()) {
            names.add(parameter.getName());
        }
        return names;
    }

    /**
     * @return the descriptor currently configured for this plugin
     */
    @Override
    public final ProtocolDescriptor getProtocolDescriptor() {
        return protocolDescriptor;
    }

    /**
     * @param protocolDescriptor the descriptor to use (mandatory Spring property)
     */
    @Required
    public void setProtocolDescriptor(final ProtocolDescriptor protocolDescriptor) {
        this.protocolDescriptor = protocolDescriptor;
    }
}

View File

@ -0,0 +1,18 @@
package eu.dnetlib.data.collector.plugin;
import java.util.List;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import eu.dnetlib.data.collector.rmi.ProtocolDescriptor;
/**
 * Contract implemented by every collector plugin.
 */
public interface CollectorPlugin {
/**
 * Collects the content exposed by the given interface.
 *
 * @param interfaceDescriptor the interface to collect from (base URL, protocol, parameters)
 * @param fromDate            lower bound of the date range; may be null
 * @param untilDate           upper bound of the date range; may be null
 * @return the collected items, each one as a string
 * @throws CollectorServiceException if the collection cannot be performed
 */
Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String fromDate, String untilDate) throws CollectorServiceException;
/**
 * @return the descriptor of the protocol handled by this plugin
 */
ProtocolDescriptor getProtocolDescriptor();
/**
 * @return the name of the protocol handled by this plugin
 */
String getProtocol();
/**
 * @return the names of the parameters this plugin accepts
 */
List<String> listNameParameters();
}

View File

@ -0,0 +1,19 @@
package eu.dnetlib.data.collector.plugin;
import java.util.LinkedList;
/**
 * List of error messages, one per failed attempt, that can render itself as a single log line.
 */
public class CollectorPluginErrorLogList extends LinkedList<String> {

    private static final long serialVersionUID = -6925786561303289704L;

    /**
     * Concatenates all messages in insertion order, each rendered as
     * {@code "Retry #<index>: <message> / "} with a zero-based index.
     *
     * @return the concatenated log line, or the empty string when the list is empty
     */
    @Override
    public String toString() {
        // A StringBuilder avoids re-copying the whole log on every element.
        final StringBuilder log = new StringBuilder();
        int index = 0;
        for (final String errorMessage : this) {
            log.append("Retry #").append(index++).append(": ").append(errorMessage).append(" / ");
        }
        return log.toString();
    }
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.data.collector.rmi;
import java.util.List;
import java.util.Map;
import javax.jws.WebParam;
import javax.jws.WebService;
import javax.xml.ws.wsaddressing.W3CEndpointReference;
import eu.dnetlib.common.rmi.BaseService;
/**
 * Web-service contract of the collector service.
 */
@WebService(targetNamespace = "http://services.dnetlib.eu/")
public interface CollectorService extends BaseService {
/**
 * Collects the content of the given interface.
 *
 * @param interfaceDescriptor the interface to collect from
 * @return an endpoint reference giving access to the collected content
 * @throws CollectorServiceException if the collection cannot be started
 */
W3CEndpointReference collect(@WebParam(name = "interface") final InterfaceDescriptor interfaceDescriptor) throws CollectorServiceException;
/**
 * Collects the content of the given interface, passing a date range to the plugin.
 *
 * @param interfaceDescriptor the interface to collect from
 * @param from                lower bound of the date range
 * @param until               upper bound of the date range
 * @return an endpoint reference giving access to the collected content
 * @throws CollectorServiceException if the collection cannot be started
 */
W3CEndpointReference dateRangeCollect(
@WebParam(name = "interface") final InterfaceDescriptor interfaceDescriptor,
@WebParam(name = "from") final String from,
@WebParam(name = "until") final String until) throws CollectorServiceException;
/**
 * @return the descriptors of the available protocols
 */
List<ProtocolDescriptor> listProtocols();
/**
 * Lists the values that a parameter of a protocol may take.
 *
 * @param protocol    the protocol name
 * @param baseUrl     the base URL handed to the lookup
 * @param param       the name of the parameter whose values are requested
 * @param otherParams the other parameters handed to the lookup
 * @return the valid values
 * @throws CollectorServiceException if the values cannot be listed
 */
List<ProtocolParameterValue> listValidValuesForParam(
@WebParam(name = "protocol") String protocol,
@WebParam(name = "baseUrl") String baseUrl,
@WebParam(name = "param") String param,
@WebParam(name = "otherParams") Map<String, String> otherParams) throws CollectorServiceException;
}

View File

@ -0,0 +1,25 @@
package eu.dnetlib.data.collector.rmi;
import eu.dnetlib.common.rmi.RMIException;
/**
 * Checked exception raised by the collector service and its plugins.
 */
public class CollectorServiceException extends RMIException {

    /** Serialization version identifier. */
    private static final long serialVersionUID = 7523999812098059764L;

    /**
     * @param message description of the failure
     */
    public CollectorServiceException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   the underlying error
     */
    public CollectorServiceException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param cause the underlying error
     */
    public CollectorServiceException(Throwable cause) {
        super(cause);
    }
}

View File

@ -0,0 +1,22 @@
package eu.dnetlib.data.collector.rmi;
/**
 * Unchecked counterpart of the collector service exception, for code paths
 * that cannot declare a checked exception.
 */
public class CollectorServiceRuntimeException extends RuntimeException {

    /** Serialization version identifier. */
    private static final long serialVersionUID = 6317717870955037359L;

    /**
     * @param message description of the failure
     */
    public CollectorServiceRuntimeException(final String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   the underlying error
     */
    public CollectorServiceRuntimeException(final String message, final Throwable cause) {
        super(message, cause);
    }

    /**
     * @param cause the underlying error
     */
    public CollectorServiceRuntimeException(final Throwable cause) {
        super(cause);
    }
}

View File

@ -0,0 +1,70 @@
package eu.dnetlib.data.collector.rmi;
import java.util.HashMap;
import javax.xml.bind.annotation.XmlRootElement;
import org.dom4j.Node;
import org.springframework.beans.factory.annotation.Required;
import com.google.common.collect.Maps;
/**
 * Describes one interface to collect from: its identifier, base URL, access
 * protocol and the protocol-specific parameters.
 */
@XmlRootElement
public class InterfaceDescriptor {

    private String id;

    private String baseUrl;

    private String protocol;

    /** Protocol parameters, keyed by parameter name. */
    private HashMap<String, String> params = new HashMap<>();

    public String getBaseUrl() {
        return this.baseUrl;
    }

    public void setBaseUrl(final String baseUrl) {
        this.baseUrl = baseUrl;
    }

    public String getId() {
        return this.id;
    }

    @Required
    public void setId(final String id) {
        this.id = id;
    }

    public HashMap<String, String> getParams() {
        return this.params;
    }

    public void setParams(final HashMap<String, String> params) {
        this.params = params;
    }

    public String getProtocol() {
        return this.protocol;
    }

    public void setProtocol(final String protocol) {
        this.protocol = protocol;
    }

    /**
     * Builds a descriptor from an XML node: the id comes from the {@code id}
     * attribute, the base URL from {@code BASE_URL}, the protocol from
     * {@code ACCESS_PROTOCOL}, and every attribute of {@code ACCESS_PROTOCOL}
     * becomes a parameter.
     *
     * @param node the node describing the interface
     * @return the populated descriptor
     */
    public static InterfaceDescriptor newInstance(final Node node) {
        final InterfaceDescriptor descriptor = new InterfaceDescriptor();
        descriptor.setId(node.valueOf("./@id"));
        descriptor.setBaseUrl(node.valueOf("./BASE_URL"));
        descriptor.setProtocol(node.valueOf("./ACCESS_PROTOCOL"));

        final HashMap<String, String> attributes = descriptor.getParams();
        for (final Object item : node.selectNodes("./ACCESS_PROTOCOL/@*")) {
            final Node attribute = (Node) item;
            attributes.put(attribute.getName(), attribute.getText());
        }
        return descriptor;
    }
}

View File

@ -0,0 +1,39 @@
package eu.dnetlib.data.collector.rmi;
import java.util.ArrayList;
import java.util.List;
import javax.xml.bind.annotation.XmlRootElement;
import org.springframework.beans.factory.annotation.Required;
/**
 * Describes a collection protocol: its name and the parameters it accepts.
 */
@XmlRootElement
public class ProtocolDescriptor {

    private String name;

    /** Parameters of the protocol; starts as an empty list. */
    private List<ProtocolParameter> params = new ArrayList<>();

    /** Creates a descriptor with no name and an empty parameter list. */
    public ProtocolDescriptor() {}

    /**
     * @param name   the protocol name
     * @param params the parameters of the protocol (stored as given, not copied)
     */
    public ProtocolDescriptor(final String name, final List<ProtocolParameter> params) {
        this.name = name;
        this.params = params;
    }

    public String getName() {
        return this.name;
    }

    @Required
    public void setName(final String name) {
        this.name = name;
    }

    public List<ProtocolParameter> getParams() {
        return this.params;
    }

    public void setParams(final List<ProtocolParameter> params) {
        this.params = params;
    }
}

View File

@ -0,0 +1,87 @@
package eu.dnetlib.data.collector.rmi;
import javax.xml.bind.annotation.XmlRootElement;
import javax.xml.bind.annotation.XmlTransient;
import org.springframework.beans.factory.annotation.Required;
import eu.dnetlib.data.collector.functions.ParamValuesFunction;
/**
 * Describes one parameter of a collection protocol.
 */
@XmlRootElement
public class ProtocolParameter {

    private String name;

    /** Defaults to {@code false}. */
    private boolean optional;

    /** Defaults to {@link ProtocolParameterType#TEXT}. */
    private ProtocolParameterType type = ProtocolParameterType.TEXT;

    private String regex;

    /** Function able to list the values of this parameter; not serialized. */
    private transient ParamValuesFunction populateFunction;

    /** Kept in sync with {@link #populateFunction} by the constructor and its setter. */
    private boolean functionPopulated;

    /** Creates a parameter with the default settings. */
    public ProtocolParameter() {}

    /**
     * Creates a parameter without a populate function.
     */
    public ProtocolParameter(final String name, final boolean optional, final ProtocolParameterType type, final String regex) {
        this(name, optional, type, regex, null);
    }

    /**
     * Creates a fully specified parameter; {@code functionPopulated} is set to
     * whether {@code populateFunction} is non-null.
     */
    public ProtocolParameter(final String name, final boolean optional, final ProtocolParameterType type, final String regex,
            final ParamValuesFunction populateFunction) {
        this.name = name;
        this.optional = optional;
        this.type = type;
        this.regex = regex;
        this.populateFunction = populateFunction;
        this.functionPopulated = (populateFunction != null);
    }

    public String getName() {
        return this.name;
    }

    @Required
    public void setName(final String name) {
        this.name = name;
    }

    public boolean isOptional() {
        return this.optional;
    }

    public void setOptional(final boolean optional) {
        this.optional = optional;
    }

    public ProtocolParameterType getType() {
        return this.type;
    }

    public void setType(final ProtocolParameterType type) {
        this.type = type;
    }

    public String getRegex() {
        return this.regex;
    }

    public void setRegex(final String regex) {
        this.regex = regex;
    }

    @XmlTransient
    public ParamValuesFunction getPopulateFunction() {
        return this.populateFunction;
    }

    /**
     * Stores the function and updates {@code functionPopulated} to whether it is non-null.
     */
    public void setPopulateFunction(final ParamValuesFunction populateFunction) {
        this.populateFunction = populateFunction;
        this.functionPopulated = (populateFunction != null);
    }

    public boolean isFunctionPopulated() {
        return this.functionPopulated;
    }

    /**
     * Overrides the flag directly, independently of the stored populate function.
     */
    public void setFunctionPopulated(final boolean functionPopulated) {
        this.functionPopulated = functionPopulated;
    }
}

View File

@ -0,0 +1,8 @@
package eu.dnetlib.data.collector.rmi;
import javax.xml.bind.annotation.XmlEnum;
/**
 * Kinds of value a protocol parameter can be declared with.
 */
@XmlEnum
public enum ProtocolParameterType {
// NOTE(review): the exact semantics of each constant (e.g. how LIST values are supplied) are not visible here — confirm with the consumers.
TEXT, NUMBER, LIST, BOOLEAN
}

View File

@ -0,0 +1,34 @@
package eu.dnetlib.data.collector.rmi;
import javax.xml.bind.annotation.XmlRootElement;
/**
 * One selectable value of a protocol parameter: an identifier plus a display name.
 */
@XmlRootElement
public class ProtocolParameterValue {

    private String id;

    private String name;

    /** Creates an empty value; both fields are null until set. */
    public ProtocolParameterValue() {}

    /**
     * @param id   the identifier of the value
     * @param name the name of the value
     */
    public ProtocolParameterValue(final String id, final String name) {
        this.id = id;
        this.name = name;
    }

    public String getId() {
        return this.id;
    }

    public void setId(final String id) {
        this.id = id;
    }

    public String getName() {
        return this.name;
    }

    public void setName(final String name) {
        this.name = name;
    }
}

View File

@ -23,11 +23,42 @@
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
</dependency>
<dependency> <dependency>
<groupId>com.ximpleware</groupId> <groupId>com.ximpleware</groupId>
<artifactId>vtd-xml</artifactId> <artifactId>vtd-xml</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.jcraft</groupId>
<artifactId>jsch</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
</dependency>
<dependency>
<groupId>commons-net</groupId>
<artifactId>commons-net</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
</dependency>
</dependencies> </dependencies>

View File

@ -0,0 +1,55 @@
package eu.dnetlib.data.collector;
import java.util.Collection;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.BeanFactory;
import org.springframework.beans.factory.BeanFactoryAware;
import org.springframework.beans.factory.ListableBeanFactory;
import eu.dnetlib.data.collector.plugin.CollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
/**
 * Looks up the {@link CollectorPlugin} beans registered in the Spring bean factory.
 */
public class CollectorPluginEnumerator implements BeanFactoryAware {

    /**
     * bean factory.
     */
    private ListableBeanFactory beanFactory;

    /**
     * Get all beans implementing the CollectorPlugin interface.
     *
     * @return the set of eu.dnetlib.data.collector.plugin.CollectorPlugin(s)
     */
    public Collection<CollectorPlugin> getAll() {
        return beanFactory.getBeansOfType(CollectorPlugin.class).values();
    }

    /**
     * Stores the bean factory; it must be a {@link ListableBeanFactory}, otherwise the cast fails.
     */
    @Override
    public void setBeanFactory(final BeanFactory beanFactory) throws BeansException {
        this.beanFactory = (ListableBeanFactory) beanFactory;
    }

    public ListableBeanFactory getBeanFactory() {
        return beanFactory;
    }

    /**
     * Get given CollectorPlugin or throws exception. The protocol name is matched ignoring case.
     *
     * @param protocol the given protocol; a null protocol never matches
     * @return a CollectorPlugin compatible with the given protocol
     * @throws CollectorServiceException when no suitable plugin is found
     */
    public CollectorPlugin get(final String protocol) throws CollectorServiceException {
        // A null protocol used to fail with a bare NullPointerException; report it
        // through the documented checked exception instead.
        if (protocol != null) {
            for (final CollectorPlugin plugin : getAll()) {
                if (protocol.equalsIgnoreCase(plugin.getProtocol())) {
                    return plugin;
                }
            }
        }
        throw new CollectorServiceException("plugin not found for protocol: " + protocol);
    }
}

View File

@ -0,0 +1,77 @@
package eu.dnetlib.data.collector;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Resource;
import javax.xml.ws.wsaddressing.W3CEndpointReference;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.data.collector.plugin.CollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorService;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import eu.dnetlib.data.collector.rmi.ProtocolDescriptor;
import eu.dnetlib.data.collector.rmi.ProtocolParameter;
import eu.dnetlib.data.collector.rmi.ProtocolParameterValue;
import eu.dnetlib.enabling.resultset.IterableResultSetFactory;
import eu.dnetlib.enabling.tools.AbstractBaseService;
/**
 * Collector service implementation: delegates the actual collection to the
 * plugin registered for the protocol of the requested interface.
 */
public class CollectorServiceImpl extends AbstractBaseService implements CollectorService {

    @Resource
    private CollectorPluginEnumerator collectorPluginEnumerator;

    @Resource
    private IterableResultSetFactory iterableResultSetFactory;

    /**
     * Collects without a date range (both bounds are passed as null).
     */
    @Override
    public W3CEndpointReference collect(final InterfaceDescriptor ifDescriptor) throws CollectorServiceException {
        return dateRangeCollect(ifDescriptor, null, null);
    }

    /**
     * Resolves the plugin for the descriptor's protocol, rejects descriptors carrying
     * parameters the plugin does not declare, then exposes the collected items as a result set.
     */
    @Override
    public W3CEndpointReference dateRangeCollect(
            final InterfaceDescriptor ifDescriptor, final String from, final String until)
            throws CollectorServiceException {
        final CollectorPlugin plugin = collectorPluginEnumerator.get(ifDescriptor.getProtocol());

        final Set<String> current = ifDescriptor.getParams().keySet();
        final Set<String> accepted = Sets.newHashSet(plugin.listNameParameters());
        if (!verifyParams(current, accepted)) {
            throw new CollectorServiceException(
                    "Invalid parameters, valid: " + plugin.listNameParameters() + ", current: " + ifDescriptor.getParams().keySet());
        }

        final Iterable<String> collected = plugin.collect(ifDescriptor, from, until);
        return iterableResultSetFactory.createIterableResultSet(collected);
    }

    /**
     * @return the protocol descriptor of every registered plugin
     */
    @Override
    public List<ProtocolDescriptor> listProtocols() {
        final List<ProtocolDescriptor> descriptors = Lists.newArrayList();
        for (final CollectorPlugin each : collectorPluginEnumerator.getAll()) {
            descriptors.add(each.getProtocolDescriptor());
        }
        return descriptors;
    }

    /**
     * Returns the values produced by the populate function of the first parameter that is
     * named {@code param} and is function-populated; an empty list when there is none.
     */
    @Override
    public List<ProtocolParameterValue> listValidValuesForParam(final String protocol,
            final String baseUrl,
            final String param,
            final Map<String, String> otherParams) throws CollectorServiceException {
        final CollectorPlugin plugin = collectorPluginEnumerator.get(protocol);
        for (final ProtocolParameter candidate : plugin.getProtocolDescriptor().getParams()) {
            final boolean usable = candidate.getName().equals(param) && candidate.isFunctionPopulated();
            if (usable) {
                return candidate.getPopulateFunction().findValues(baseUrl, otherParams);
            }
        }
        return Lists.newArrayList();
    }

    /**
     * @return true when every name in {@code curr} is also in {@code valid}
     */
    private boolean verifyParams(final Set<String> curr, final Set<String> valid) {
        for (final String name : curr) {
            if (!valid.contains(name)) {
                return false;
            }
        }
        return true;
    }
}

View File

@ -0,0 +1,56 @@
package eu.dnetlib.data.collector.functions;
import java.io.StringReader;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.springframework.beans.factory.annotation.Required;
import com.google.common.base.Function;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import eu.dnetlib.data.collector.plugins.oaisets.OaiSetsIteratorFactory;
import eu.dnetlib.data.collector.rmi.ProtocolParameterValue;
/**
 * Lists the OAI sets exposed at a base URL as selectable parameter values.
 */
public class ListOaiSetsFunction implements ParamValuesFunction {

    private OaiSetsIteratorFactory oaiSetsIteratorFactory;

    /**
     * Iterates over the set descriptions provided by the factory for {@code baseUrl}
     * and converts each one into a parameter value.
     *
     * @param baseUrl the base URL handed to the sets iterator factory
     * @param params  not used by this implementation
     * @return one value per set description
     */
    @Override
    public List<ProtocolParameterValue> findValues(final String baseUrl, final Map<String, String> params) {
        final SAXReader reader = new SAXReader();
        final Iterator<ProtocolParameterValue> values =
                Iterators.transform(oaiSetsIteratorFactory.newIterator(baseUrl), setXml -> toParameterValue(reader, setXml));
        return Lists.newArrayList(values);
    }

    /**
     * Parses one set description: the id is the {@code setSpec} text; the label is the id
     * alone when the {@code setName} is blank or equal to it (ignoring case), otherwise
     * the id followed by the quoted name.
     */
    private static ProtocolParameterValue toParameterValue(final SAXReader reader, final String setXml) {
        final Document doc;
        try {
            doc = reader.read(new StringReader(setXml));
        } catch (final DocumentException e) {
            throw new RuntimeException("Error in ListSets", e);
        }
        final String id = doc.valueOf("//*[local-name()='setSpec']");
        final String name = doc.valueOf("//*[local-name()='setName']");

        final String label;
        if (StringUtils.isBlank(name) || name.equalsIgnoreCase(id)) {
            label = id;
        } else {
            label = id + " - name: \"" + name + "\"";
        }
        return new ProtocolParameterValue(id, label);
    }

    public OaiSetsIteratorFactory getOaiSetsIteratorFactory() {
        return oaiSetsIteratorFactory;
    }

    @Required
    public void setOaiSetsIteratorFactory(final OaiSetsIteratorFactory oaiSetsIteratorFactory) {
        this.oaiSetsIteratorFactory = oaiSetsIteratorFactory;
    }
}

View File

@ -0,0 +1,38 @@
package eu.dnetlib.data.collector.plugins;
import java.io.BufferedInputStream;
import java.util.Iterator;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import eu.dnetlib.miscutils.iterators.xml.XMLIterator;
/**
 * Base class for plugins that read one XML stream and split it into records
 * on the element named by the {@code splitOnElement} parameter.
 */
public abstract class AbstractSplittedRecordPlugin extends AbstractCollectorPlugin {

    /**
     * Opens the stream for the descriptor's base URL and returns an iterable whose
     * iterators split that stream on the configured element.
     * Every iterator reads from the same stream instance, which is opened once, here.
     *
     * @throws CollectorServiceException if the base URL or the {@code splitOnElement} parameter is blank,
     *                                   or the stream cannot be opened
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        final String baseUrl = interfaceDescriptor.getBaseUrl();
        final String element = interfaceDescriptor.getParams().get("splitOnElement");

        if (StringUtils.isBlank(baseUrl)) {
            throw new CollectorServiceException("Param 'baseurl' is null or empty");
        }
        if (StringUtils.isBlank(element)) {
            throw new CollectorServiceException("Param 'splitOnElement' is null or empty");
        }

        final BufferedInputStream stream = getBufferedInputStream(baseUrl);
        return () -> new XMLIterator(element, stream);
    }

    /**
     * Opens the content located at {@code baseUrl}.
     *
     * @param baseUrl the location to read
     * @return a buffered stream over the content
     * @throws CollectorServiceException if the content cannot be opened
     */
    abstract protected BufferedInputStream getBufferedInputStream(final String baseUrl) throws CollectorServiceException;
}

View File

@ -0,0 +1,19 @@
package eu.dnetlib.data.collector.plugins;
import java.io.BufferedInputStream;
import java.net.URL;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
public class ClasspathCollectorPlugin extends AbstractSplittedRecordPlugin {
@Override
protected BufferedInputStream getBufferedInputStream(final String baseUrl) throws CollectorServiceException {
try {
return new BufferedInputStream(getClass().getResourceAsStream(new URL(baseUrl).getPath()));
} catch (Exception e) {
throw new CollectorServiceException("Error dowloading url: " + baseUrl);
}
}
}

View File

@ -0,0 +1,149 @@
package eu.dnetlib.data.collector.plugins;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import org.apache.commons.io.input.BOMInputStream;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
 * Reads a CSV file from the local file system and emits every row as a
 * {@code <csvRecord>} XML document with one {@code <column>} element per field.
 *
 * Please use eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin instead
 */
@Deprecated
public class FileCSVCollectorPlugin extends AbstractCollectorPlugin {

    private static final Log log = LogFactory.getLog(FileCSVCollectorPlugin.class);

    /**
     * Iterator over the rows of the CSV file; the next record is always computed one step ahead.
     */
    class FileCSVIterator implements Iterator<String> {

        private String next;

        private BufferedReader reader;

        private String separator;

        private String quote;

        public FileCSVIterator(final BufferedReader reader, final String separator, final String quote) {
            this.reader = reader;
            this.separator = separator;
            this.quote = quote;
            next = calculateNext();
        }

        @Override
        public boolean hasNext() {
            return next != null;
        }

        @Override
        public String next() {
            final String s = next;
            next = calculateNext();
            return s;
        }

        /**
         * Reads the next line and converts it to XML.
         *
         * @return the XML of the next row, or null when the file is exhausted or cannot be read
         */
        private String calculateNext() {
            try {
                String newLine = reader.readLine();
                // FOR SOME FILES IT RETURN NULL ALSO IF THE FILE IS NOT READY DONE
                if (newLine == null) {
                    newLine = reader.readLine();
                }
                if (newLine == null) {
                    log.info("there is no line, closing RESULT SET");
                    reader.close();
                    return null;
                }

                final Document document = DocumentHelper.createDocument();
                final Element root = document.addElement("csvRecord");
                // NOTE(review): String.split treats the separator as a regular expression,
                // so separators such as "|" or "." do not behave literally — confirm intent.
                final String[] currentRow = newLine.split(separator);
                for (int i = 0; i < currentRow.length; i++) {
                    final String hAttribute = (headers != null) && (i < headers.length) ? headers[i] : "column" + i;
                    final Element row = root.addElement("column");
                    if (i == identifierNumber) {
                        row.addAttribute("isID", "true");
                    }
                    final String value = StringUtils.isBlank(quote) ? currentRow[i] : StringUtils.strip(currentRow[i], quote);
                    row.addAttribute("name", hAttribute).addText(value);
                }
                return document.asXML();
            } catch (final IOException e) {
                log.error("Error calculating next csv element", e);
                // The iteration ends here, so the file handle must not stay open.
                closeQuietly(reader);
            }
            return null;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    // NOTE(review): these two fields are shared by all collect() calls on this instance;
    // concurrent collections on the same plugin bean would interfere with each other.
    private String[] headers = null;

    private int identifierNumber;

    /**
     * Opens the file addressed by the path of the descriptor's base URL and returns an
     * iterable over its rows.
     * Parameters read from the descriptor: {@code header} ("true" to use the first line as
     * column names), {@code separator} (Java escapes are resolved), {@code quote} (stripped
     * from both ends of each value) and {@code identifier} (zero-based index of the ID column).
     *
     * @param interfaceDescriptor the interface to collect from
     * @param fromDate            not used by this plugin
     * @param untilDate           not used by this plugin
     * @return an iterable over the XML records
     * @throws CollectorServiceException if a parameter is invalid or the file cannot be opened
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        final String header = interfaceDescriptor.getParams().get("header");
        final String separator = StringEscapeUtils.unescapeJava(interfaceDescriptor.getParams().get("separator"));
        final String quote = interfaceDescriptor.getParams().get("quote");
        final String identifier = interfaceDescriptor.getParams().get("identifier");

        if (separator == null) {
            throw new CollectorServiceException("Param 'separator' is null");
        }
        try {
            identifierNumber = Integer.parseInt(identifier);
        } catch (final NumberFormatException e) {
            throw new CollectorServiceException("Param 'identifier' must be the index of the ID column, found: " + identifier, e);
        }
        // Headers read by a previous call must not leak into a collection without a header line.
        headers = null;

        URL u = null;
        try {
            u = new URL(interfaceDescriptor.getBaseUrl());
        } catch (final MalformedURLException e1) {
            throw new CollectorServiceException(e1);
        }
        final String baseUrl = u.getPath();

        log.info("base URL = " + baseUrl);

        BufferedReader opened = null;
        try {
            // NOTE(review): the file is decoded with the platform default charset — confirm this is intended.
            opened = new BufferedReader(new InputStreamReader(new BOMInputStream(new FileInputStream(baseUrl))));
            final BufferedReader br = opened;

            if ("true".equalsIgnoreCase(header)) {
                final String headerLine = br.readLine();
                if (headerLine == null) {
                    throw new IOException("Missing header line, the file is empty: " + baseUrl);
                }
                final String[] tmpHeader = headerLine.split(separator);
                if (StringUtils.isNotBlank(quote)) {
                    headers = new String[tmpHeader.length];
                    for (int i = 0; i < tmpHeader.length; i++) {
                        headers[i] = StringUtils.strip(tmpHeader[i], quote);
                    }
                } else {
                    headers = tmpHeader;
                }
            }
            return () -> new FileCSVIterator(br, separator, quote);
        } catch (final Exception e) {
            // Do not leave the file open when the collection cannot start.
            if (opened != null) {
                closeQuietly(opened);
            }
            throw new CollectorServiceException(e);
        }
    }

    /** Closes the reader, logging (not propagating) any failure. */
    private static void closeQuietly(final BufferedReader reader) {
        try {
            reader.close();
        } catch (final IOException e) {
            log.debug("Error closing csv reader", e);
        }
    }
}

View File

@ -0,0 +1,20 @@
package eu.dnetlib.data.collector.plugins;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.net.URL;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
/**
 * Reads the XML to split from a file on the local file system; the file path
 * is the path component of the base URL.
 */
public class FileCollectorPlugin extends AbstractSplittedRecordPlugin {

    /**
     * Opens the file addressed by the path of {@code baseUrl}.
     *
     * @param baseUrl URL whose path names the file
     * @return a buffered stream over the file
     * @throws CollectorServiceException if the URL is malformed or the file cannot be opened
     */
    @Override
    protected BufferedInputStream getBufferedInputStream(final String baseUrl) throws CollectorServiceException {
        final FileInputStream file;
        try {
            final String path = new URL(baseUrl).getPath();
            file = new FileInputStream(path);
        } catch (Exception e) {
            throw new CollectorServiceException("Error reading file " + baseUrl, e);
        }
        return new BufferedInputStream(file);
    }
}

View File

@ -0,0 +1,23 @@
package eu.dnetlib.data.collector.plugins;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.net.URL;
import java.util.zip.GZIPInputStream;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
/**
 * Reads the XML to split from a gzip-compressed file on the local file system;
 * the file path is the path component of the base URL.
 */
public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin {

    /**
     * Opens the gzip file addressed by the path of {@code baseUrl}.
     *
     * @param baseUrl URL whose path names the compressed file
     * @return a buffered stream over the decompressed content
     * @throws CollectorServiceException if the URL is malformed, the file cannot be opened,
     *                                   or it is not valid gzip data
     */
    @Override
    protected BufferedInputStream getBufferedInputStream(final String baseUrl) throws CollectorServiceException {
        FileInputStream file = null;
        try {
            file = new FileInputStream(new URL(baseUrl).getPath());
            // The GZIPInputStream constructor reads the gzip header and may throw.
            final GZIPInputStream stream = new GZIPInputStream(file);
            return new BufferedInputStream(stream);
        } catch (Exception e) {
            // Release the file handle when the gzip header could not be read.
            if (file != null) {
                try {
                    file.close();
                } catch (java.io.IOException ignored) {
                    // nothing more can be done; the original failure is reported below
                }
            }
            throw new CollectorServiceException(e);
        }
    }
}

View File

@ -0,0 +1,170 @@
package eu.dnetlib.data.collector.plugins;
import java.io.*;
import java.net.URL;
import java.util.Iterator;
import java.util.Set;
import com.google.common.collect.Iterators;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.io.input.BOMInputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
/**
 * Downloads a CSV document from a URL and emits every record as a
 * {@code <csvRecord>} XML document with one {@code <column>} element per header.
 */
public class HttpCSVCollectorPlugin extends AbstractCollectorPlugin {

    private static final Log log = LogFactory.getLog(HttpCSVCollectorPlugin.class);

    public static final String UTF8_BOM = "\uFEFF";

    /**
     * Iterable over the records of the CSV document described by an {@link InterfaceDescriptor}.
     */
    class HTTPCSVIterator implements Iterable<String> {

        /** The descriptor. */
        private InterfaceDescriptor descriptor;

        /**
         * Instantiates a new HTTPCSV iterator.
         *
         * @param descriptor
         *            the descriptor
         */
        public HTTPCSVIterator(final InterfaceDescriptor descriptor) {
            this.descriptor = descriptor;
        }

        /**
         * Downloads the document into a temporary file (dropping lines with invalid quotes when
         * the quote character is a double quote), then parses that file lazily.
         * Parameters read from the descriptor: {@code separator} (blank or the two characters
         * {@code \t} mean TAB, otherwise its first character), {@code quote} and {@code identifier}
         * (name of the column flagged with {@code isID="true"}).
         *
         * @return the iterator over the XML records
         * @throws IllegalStateException if the document cannot be downloaded or parsed
         */
        @SuppressWarnings("resource")
        @Override
        public Iterator<String> iterator() {
            File createdFile = null;
            Closeable openedReader = null;
            try {
                final String separator = descriptor.getParams().get("separator");
                final String identifier = descriptor.getParams().get("identifier");
                final String quote = descriptor.getParams().get("quote");
                // Resolve the delimiter once so that quote verification and parsing agree on it
                // (the blank check comes first, so a missing separator no longer fails).
                final char delimiter = StringUtils.isBlank(separator) || separator.equals("\\t") ? '\t' : separator.charAt(0);
                final URL url = new URL(descriptor.getBaseUrl());
                long nLines = 0;

                // FIX
                // This code should skip the lines with invalid quotes
                final File tempFile = File.createTempFile("csv-", ".tmp");
                createdFile = tempFile;

                // NOTE(review): the download is decoded, and the temp file written and read, with the
                // platform default charset — confirm this is intended.
                try (InputStream is = url.openConnection().getInputStream();
                        BOMInputStream bomIs = new BOMInputStream(is);
                        BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs));
                        FileWriter fw = new FileWriter(tempFile)) {

                    String line;
                    while ((line = reader.readLine()) != null) {
                        if (StringUtils.isBlank(quote) || (quote.charAt(0) != '"') || verifyQuotes(line, delimiter)) {
                            fw.write(line);
                            fw.write("\n");
                            nLines++;
                        }
                    }
                }
                // END FIX

                final CSVFormat format = CSVFormat.EXCEL
                        .withHeader()
                        .withDelimiter(delimiter)
                        .withQuote(StringUtils.isBlank(quote) ? null : quote.charAt(0))
                        .withTrim();

                final Reader tempReader = new FileReader(tempFile);
                openedReader = tempReader;
                final CSVParser parser = new CSVParser(tempReader, format);
                final Set<String> headers = parser.getHeaderMap().keySet();

                // The first written line is the header, every other line is expected to be one record.
                final long nRecords = nLines - 1;

                return Iterators.transform(parser.iterator(), input -> {
                    try {
                        final Document document = DocumentHelper.createDocument();
                        final Element root = document.addElement("csvRecord");
                        for (final String key : headers) {
                            final Element row = root.addElement("column");
                            row.addAttribute("name", key).addText(XmlCleaner.cleanAllEntities(input.get(key)));
                            if (key.equals(identifier)) {
                                row.addAttribute("isID", "true");
                            }
                        }
                        return document.asXML();
                    } finally {
                        log.debug(tempFile.getAbsolutePath());
                        if (parser.getRecordNumber() == nRecords) {
                            // Last record: release the file handle before removing the file.
                            log.debug("DELETING " + tempFile.getAbsolutePath());
                            closeQuietly(parser);
                            tempFile.delete();
                        }
                    }
                });
            } catch (final Exception e) {
                log.error("Error iterating csv lines", e);
                // Nothing will consume the temp file: release it now.
                closeQuietly(openedReader);
                if ((createdFile != null) && !createdFile.delete()) {
                    log.warn("Unable to delete " + createdFile.getAbsolutePath());
                }
                // Returning null here would only surface later as an anonymous NullPointerException.
                throw new IllegalStateException("Error iterating csv lines", e);
            }
        }
    }

    /*
     * (non-Javadoc)
     *
     * @see eu.dnetlib.data.collector.plugin.CollectorPlugin#collect(eu.dnetlib.data.collector.rmi.InterfaceDescriptor, java.lang.String,
     * java.lang.String)
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor descriptor, final String fromDate, final String untilDate) throws CollectorServiceException {
        return new HTTPCSVIterator(descriptor);
    }

    /**
     * Checks that the double quotes of a CSV line are balanced: a field opens with a quote at
     * the start of the line or right after a separator, closes with a quote at the end of the
     * line or right before a separator, and inner quotes must be doubled.
     *
     * @param line      the line to check (leading and trailing whitespace is ignored)
     * @param separator the field separator
     * @return true if the line is valid, false (after logging a warning) otherwise
     */
    public boolean verifyQuotes(final String line, final char separator) {
        final char[] cs = line.trim().toCharArray();
        boolean inField = false;
        boolean skipNext = false;
        for (int i = 0; i < cs.length; i++) {
            if (skipNext) {
                // second quote of a doubled (escaped) quote
                skipNext = false;
            } else if (inField) {
                if ((cs[i] == '\"') && ((i == (cs.length - 1)) || (cs[i + 1] == separator))) {
                    inField = false;
                } else if ((cs[i] == '\"') && (i < (cs.length - 1))) {
                    if ((cs[i + 1] == '\"')) {
                        skipNext = true;
                    } else {
                        log.warn("Skipped invalid line: " + line);
                        return false;
                    }
                }
            } else {
                if ((cs[i] == '\"') && ((i == 0) || (cs[i - 1] == separator))) {
                    inField = true;
                }
            }
        }
        if (inField) {
            log.warn("Skipped invalid line: " + line);
            return false;
        }
        return true;
    }

    /** Closes the resource if present, logging (not propagating) any failure. */
    private static void closeQuietly(final Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (final IOException e) {
            log.debug("Error closing csv resource", e);
        }
    }
}

View File

@ -0,0 +1,39 @@
package eu.dnetlib.data.collector.plugins;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
public class HttpCollectorPlugin extends AbstractSplittedRecordPlugin {
@Override
protected BufferedInputStream getBufferedInputStream(final String baseUrl) throws CollectorServiceException {
final HttpGet method = new HttpGet(baseUrl);
try(CloseableHttpResponse response = HttpClients.createDefault().execute(method)) {
int responseCode = response.getStatusLine().getStatusCode();
if (HttpStatus.SC_OK != responseCode) {
throw new CollectorServiceException("Error " + responseCode + " dowloading url: " + baseUrl);
}
byte[] content = IOUtils.toByteArray(response.getEntity().getContent());
try(InputStream in = new ByteArrayInputStream(content)) {
return new BufferedInputStream(in);
}
} catch (IOException e) {
throw new CollectorServiceException("Error dowloading url: " + baseUrl);
}
}
}

View File

@ -0,0 +1,224 @@
package eu.dnetlib.data.collector.plugins;
import java.io.IOException;
import java.io.InputStream;
import java.net.*;
import java.security.GeneralSecurityException;
import java.security.cert.X509Certificate;
import java.util.List;
import java.util.Map;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import eu.dnetlib.data.collector.plugin.CollectorPluginErrorLogList;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* @author jochen, michele, andrea
*/
public class HttpConnector {
// Logger of this class.
private static final Log log = LogFactory.getLog(HttpConnector.class);
// Highest retry number accepted by attemptDownload before it gives up with an exception.
private int maxNumberOfRetry = 6;
// Pause applied before retrying after an I/O error while reading the response body.
private int defaultDelay = 120; // seconds
// Read timeout set on every HTTP connection opened by attemptDownload.
private int readTimeOut = 120; // seconds
// NOTE(review): where this field is read or written is not visible in this part of the file — confirm its role.
private String responseType = null;
// Value sent as the User-Agent request header.
private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
/**
 * Creates the connector and installs a default cookie handler that accepts all cookies.
 * Note that {@code CookieHandler.setDefault} sets a system-wide handler, so this
 * affects HTTP connections opened anywhere in the JVM, not only through this instance.
 */
public HttpConnector() {
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
}
/**
 * Given the URL returns the content via HTTP GET.
 * The download starts at retry number 1 with an empty error list.
 *
 * @param requestUrl the URL
 * @return the content of the downloaded resource
 * @throws CollectorServiceException when retrying more than maxNumberOfRetry times
 */
public String getInputSource(final String requestUrl) throws CollectorServiceException {
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
}
/**
 * Given the URL returns the content as a stream via HTTP GET.
 * The download starts at retry number 1 with an empty error list.
 *
 * @param requestUrl the URL
 * @return the content of the downloaded resource as InputStream
 * @throws CollectorServiceException when retrying more than maxNumberOfRetry times
 */
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException {
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
}
private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
throws CollectorServiceException {
try {
InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
try {
return IOUtils.toString(s);
} catch (IOException e) {
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
Thread.sleep(defaultDelay * 1000);
errorList.add(e.getMessage());
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
}
finally{
IOUtils.closeQuietly(s);
}
} catch (InterruptedException e) {
throw new CollectorServiceException(e);
}
}
private InputStream attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
throws CollectorServiceException {
if (retryNumber > maxNumberOfRetry) { throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList); }
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
try {
InputStream input = null;
try {
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
urlConn.setInstanceFollowRedirects(false);
urlConn.setReadTimeout(readTimeOut * 1000);
urlConn.addRequestProperty("User-Agent", userAgent);
if (log.isDebugEnabled()) {
logHeaderFields(urlConn);
}
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
log.warn("waiting and repeating request after " + retryAfter + " sec.");
Thread.sleep(retryAfter * 1000);
errorList.add("503 Service Unavailable");
urlConn.disconnect();
return attemptDownload(requestUrl, retryNumber + 1, errorList);
} else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM) || (urlConn.getResponseCode()
== HttpURLConnection.HTTP_MOVED_TEMP)) {
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
log.debug("The requested url has been moved to " + newUrl);
errorList.add(String.format("%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
urlConn.disconnect();
return attemptDownload(newUrl, retryNumber + 1, errorList);
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
log.error(String.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
Thread.sleep(defaultDelay * 1000);
errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
urlConn.disconnect();
return attemptDownload(requestUrl, retryNumber + 1, errorList);
} else {
input = urlConn.getInputStream();
responseType = urlConn.getContentType();
return input;
}
} catch (IOException e) {
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
Thread.sleep(defaultDelay * 1000);
errorList.add(e.getMessage());
return attemptDownload(requestUrl, retryNumber + 1, errorList);
}
} catch (InterruptedException e) {
throw new CollectorServiceException(e);
}
}
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
log.debug("StatusCode: " + urlConn.getResponseMessage());
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
if (e.getKey() != null) {
for (String v : e.getValue()) {
log.debug(" key: " + e.getKey() + " - value: " + v);
}
}
}
}
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
for (String key : headerMap.keySet()) {
if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) && NumberUtils.isCreatable(headerMap.get(key).get(0))) {
return Integer
.parseInt(headerMap.get(key).get(0)) + 10;
}
}
return -1;
}
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorServiceException {
for (String key : headerMap.keySet()) {
if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { return headerMap.get(key).get(0); }
}
throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING");
}
/**
* register for https scheme; this is a workaround and not intended for the use in trusted environments
*/
public void initTrustManager() {
final X509TrustManager tm = new X509TrustManager() {
@Override
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
}
@Override
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
}
@Override
public X509Certificate[] getAcceptedIssuers() {
return null;
}
};
try {
final SSLContext ctx = SSLContext.getInstance("TLS");
ctx.init(null, new TrustManager[] { tm }, null);
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
} catch (GeneralSecurityException e) {
log.fatal(e);
throw new IllegalStateException(e);
}
}
public int getMaxNumberOfRetry() {
return maxNumberOfRetry;
}
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
this.maxNumberOfRetry = maxNumberOfRetry;
}
public int getDefaultDelay() {
return defaultDelay;
}
public void setDefaultDelay(final int defaultDelay) {
this.defaultDelay = defaultDelay;
}
public int getReadTimeOut() {
return readTimeOut;
}
public void setReadTimeOut(final int readTimeOut) {
this.readTimeOut = readTimeOut;
}
public String getResponseType() {
return responseType;
}
}

View File

@ -0,0 +1,24 @@
package eu.dnetlib.data.collector.plugins.archive.targz;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
 * Collector plugin that exposes the records of a .tar.gz archive through a {@link TarGzIterable}.
 *
 * @author andrea
 *
 */
public class TarGzCollectorPlugin extends AbstractCollectorPlugin {

    /**
     * Validates the base url of the interface and wraps the interface in a {@link TarGzIterable}.
     * The date parameters are not used by this plugin.
     *
     * @param interfaceDescriptor the interface whose base url locates the archive
     * @param fromDate            ignored
     * @param untilDate           ignored
     * @return an iterable over the records of the archive
     * @throws CollectorServiceException when the base url is null or empty
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {

        final String archiveLocation = interfaceDescriptor.getBaseUrl();
        final boolean locationMissing = (archiveLocation == null) || (archiveLocation.length() == 0);
        if (locationMissing) {
            throw new CollectorServiceException("Param 'baseurl' is null or empty");
        }
        return new TarGzIterable(interfaceDescriptor);
    }

}

View File

@ -0,0 +1,48 @@
package eu.dnetlib.data.collector.plugins.archive.targz;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import com.google.common.base.Function;
import com.google.common.collect.Iterators;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
 * Iterable over the records contained in a .tar.gz archive. Each record is stripped of a leading
 * byte-order mark and passed through {@link XmlCleaner#cleanAllEntities(String)}.
 *
 * @author Andrea
 */
public class TarGzIterable implements Iterable<String> {

    /** The tar.gz archive, resolved from the path component of the interface base url. */
    private File tarGzFile;

    /**
     * @param interfaceDescriptor the interface whose base url points to the archive
     * @throws CollectorServiceException when the base url is not a valid URL or the file does not exist
     */
    public TarGzIterable(final InterfaceDescriptor interfaceDescriptor) throws CollectorServiceException {
        final URL archiveUrl;
        try {
            archiveUrl = new URL(interfaceDescriptor.getBaseUrl());
        } catch (MalformedURLException e) {
            throw new CollectorServiceException("TarGz collector failed! ", e);
        }
        this.tarGzFile = new File(archiveUrl.getPath());
        if (!this.tarGzFile.exists()) {
            throw new CollectorServiceException(String.format("The base ULR %s, does not exist", this.tarGzFile.getPath()));
        }
    }

    /**
     * Opens the archive and returns an iterator over its cleaned records.
     */
    @Override
    public Iterator<String> iterator() {
        final Function<String, String> cleaner = new Function<String, String>() {

            @Override
            public String apply(final String inputRecord) {
                String withoutBom = inputRecord;
                if (withoutBom.startsWith("\uFEFF")) {
                    withoutBom = withoutBom.substring(1);
                }
                return XmlCleaner.cleanAllEntities(withoutBom);
            }
        };
        return Iterators.transform(new TarGzIterator(this.tarGzFile.getAbsolutePath()), cleaner);
    }

}

View File

@ -0,0 +1,86 @@
package eu.dnetlib.data.collector.plugins.archive.targz;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Iterator;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Iterator over the regular-file entries of a .tar.gz archive; every entry is returned as a string
 * decoded with the platform default charset. Opening and extraction problems are logged and end the
 * iteration instead of being thrown. The archive stream is closed as soon as the iteration ends.
 */
public class TarGzIterator implements Iterator<String> {

    /** The Constant log. */
    private static final Log log = LogFactory.getLog(TarGzIterator.class);

    /** The archive stream; null when the archive could not be opened or has been closed. */
    private TarArchiveInputStream tarInputStream;

    /** The record returned by the next call to {@link #next()}; null when the iteration is over. */
    private String current;

    /**
     * @param tarGzPath the path of the archive on the file system
     */
    public TarGzIterator(final String tarGzPath) {
        open(new File(tarGzPath), tarGzPath);
    }

    /**
     * @param tarGzFile the archive on the file system
     */
    public TarGzIterator(final File tarGzFile) {
        open(tarGzFile, tarGzFile.getAbsolutePath());
    }

    /**
     * Opens the archive and pre-loads the first record.
     *
     * @param archive the archive to open
     * @param label   the name of the archive used in the log messages
     */
    private void open(final File archive, final String label) {
        FileInputStream raw = null;
        try {
            raw = new FileInputStream(archive);
            this.tarInputStream = new TarArchiveInputStream(new BufferedInputStream(new GZIPInputStream(raw)));
            this.current = findNext();
        } catch (FileNotFoundException e) {
            log.error("Tar.gz file not found: " + label, e);
        } catch (IOException e) {
            log.error("Problem opening tar.gz file " + label, e);
            // the gzip header could not be read: release the file handle that is already open
            if (this.tarInputStream == null && raw != null) {
                try {
                    raw.close();
                } catch (IOException ignored) {
                    // nothing more can be done for a stream that failed to open properly
                }
            }
        }
    }

    @Override
    public boolean hasNext() {
        return current != null;
    }

    /**
     * @return the content of the next file entry
     * @throws java.util.NoSuchElementException when the iteration has no more elements
     */
    @Override
    public String next() {
        if (current == null) { throw new java.util.NoSuchElementException(); }
        final String ret = current;
        current = findNext();
        return ret;
    }

    /** Not supported by this iterator: the call is silently ignored. */
    @Override
    public void remove() {}

    /**
     * Moves to the next regular-file entry and reads its content.
     *
     * @return the content of the entry, or null when the archive is exhausted or cannot be read
     */
    private synchronized String findNext() {
        if (tarInputStream == null) { return null; }

        TarArchiveEntry entry;
        try {
            while (null != (entry = tarInputStream.getNextTarEntry()) && !entry.isFile()) {
                log.debug("Skipping TAR entry " + entry.getName());
            }
        } catch (IOException e) {
            log.error("Error during tar.gz extraction", e);
            closeArchive();
            return null;
        }

        if (entry == null) {
            closeArchive();
            return null;
        }

        log.debug("Extracting " + entry.getName());
        try {
            return new String(readEntry(entry));
        } catch (IOException e) {
            log.error("Impossible to extract file " + entry.getName(), e);
            closeArchive();
            return null;
        }
    }

    /**
     * Reads all the bytes of the current entry; a single read() call may return fewer bytes than requested.
     */
    private byte[] readEntry(final TarArchiveEntry entry) throws IOException {
        final long size = entry.getSize();
        if (size > Integer.MAX_VALUE) { throw new IOException("Entry too large to be loaded in memory: " + size + " bytes"); }

        final byte[] content = new byte[(int) size];
        int offset = 0;
        while (offset < content.length) {
            final int read = tarInputStream.read(content, offset, content.length - offset);
            if (read < 0) { throw new java.io.EOFException("Unexpected end of archive after " + offset + " of " + content.length + " bytes"); }
            offset += read;
        }
        return content;
    }

    /** Closes the archive stream, if open. */
    private void closeArchive() {
        if (tarInputStream != null) {
            try {
                tarInputStream.close();
            } catch (IOException e) {
                log.warn("Error closing tar.gz stream", e);
            }
            tarInputStream = null;
        }
    }

}

View File

@ -0,0 +1,24 @@
package eu.dnetlib.data.collector.plugins.archive.zip;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
 * Collector plugin that exposes the records of a zip archive through a {@link ZipIterable}.
 *
 * @author Andrea
 *
 */
public class ZipCollectorPlugin extends AbstractCollectorPlugin {

    /**
     * Validates the base url of the interface and wraps the interface in a {@link ZipIterable}.
     * The date parameters are not used by this plugin.
     *
     * @param interfaceDescriptor the interface whose base url locates the archive
     * @param fromDate            ignored
     * @param untilDate           ignored
     * @return an iterable over the records of the archive
     * @throws CollectorServiceException when the base url is null or empty
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {

        final String baseUrl = interfaceDescriptor.getBaseUrl();
        if (baseUrl != null && !baseUrl.isEmpty()) {
            return new ZipIterable(interfaceDescriptor);
        }
        throw new CollectorServiceException("Param 'baseurl' is null or empty");
    }

}

View File

@ -0,0 +1,48 @@
package eu.dnetlib.data.collector.plugins.archive.zip;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import com.google.common.base.Function;
import com.google.common.collect.Iterators;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
 * Iterable over the records contained in a zip archive. Each record is stripped of a leading
 * byte-order mark and passed through {@link XmlCleaner#cleanAllEntities(String)}.
 *
 * @author Andrea
 *
 */
public class ZipIterable implements Iterable<String> {

    /** The .zip archive, resolved from the path component of the interface base url. */
    private File zipFile;

    /**
     * @param interfaceDescriptor the interface whose base url points to the archive
     * @throws CollectorServiceException when the base url is not a valid URL or the file does not exist
     */
    public ZipIterable(final InterfaceDescriptor interfaceDescriptor) throws CollectorServiceException {
        final URL archiveUrl;
        try {
            archiveUrl = new URL(interfaceDescriptor.getBaseUrl());
        } catch (MalformedURLException e) {
            throw new CollectorServiceException("Zip collector failed! ", e);
        }
        this.zipFile = new File(archiveUrl.getPath());
        final boolean archiveFound = this.zipFile.exists();
        if (!archiveFound) {
            throw new CollectorServiceException(String.format("The base ULR %s, does not exist", this.zipFile.getPath()));
        }
    }

    /**
     * Opens the archive and returns an iterator over its cleaned records.
     */
    @Override
    public Iterator<String> iterator() {
        final Iterator<String> rawRecords = new ZipIterator(this.zipFile.getAbsolutePath());
        return Iterators.transform(rawRecords, new Function<String, String>() {

            @Override
            public String apply(final String inputRecord) {
                final boolean hasBom = inputRecord.startsWith("\uFEFF");
                final String record = hasBom ? inputRecord.substring(1) : inputRecord;
                return XmlCleaner.cleanAllEntities(record);
            }
        });
    }

}

View File

@ -0,0 +1,80 @@
package eu.dnetlib.data.collector.plugins.archive.zip;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Iterator over the non-directory entries of a zip archive; the content of every entry is returned
 * as a string. Opening and extraction problems are logged and end the iteration instead of being
 * thrown. The archive is closed as soon as the iteration ends.
 */
public class ZipIterator implements Iterator<String> {

    /** The Constant log. */
    private static final Log log = LogFactory.getLog(ZipIterator.class);

    /** The archive; null when it could not be opened or has been closed. */
    ZipFile zipFile;

    /** The entries of the archive still to be visited. */
    Enumeration<? extends ZipEntry> entries;

    /** The record returned by the next call to {@link #next()}; null when the iteration is over. */
    private String current;

    /**
     * @param zipPath the path of the archive on the file system
     */
    public ZipIterator(final String zipPath) {
        try {
            this.zipFile = new ZipFile(zipPath);
            this.entries = zipFile.entries();
            this.current = findNext();
        } catch (IOException e) {
            log.error("Problems opening the .zip file " + zipPath, e);
        }
    }

    /**
     * @param file the archive on the file system
     */
    public ZipIterator(final File file) {
        try {
            this.zipFile = new ZipFile(file);
            this.entries = zipFile.entries();
            this.current = findNext();
        } catch (IOException e) {
            // zipFile is still null here, so the name is taken from the constructor argument
            log.error("Problems opening the .zip file " + file.getPath(), e);
        }
    }

    @Override
    public boolean hasNext() {
        return current != null;
    }

    /**
     * @return the content of the next entry
     * @throws java.util.NoSuchElementException when the iteration has no more elements
     */
    @Override
    public String next() {
        if (current == null) { throw new java.util.NoSuchElementException(); }
        final String ret = current;
        current = findNext();
        return ret;
    }

    /** Not supported by this iterator: the call is silently ignored. */
    @Override
    public void remove() {}

    /**
     * Moves to the next non-directory entry and reads its content.
     *
     * @return the content of the entry, or null when the archive is exhausted or cannot be read
     */
    private synchronized String findNext() {
        if (zipFile == null || entries == null) { return null; }

        // Only a non-directory entry is kept: a trailing directory must not be extracted as a record.
        ZipEntry entry = null;
        while (entry == null && entries.hasMoreElements()) {
            final ZipEntry candidate = entries.nextElement();
            if (candidate.isDirectory()) {
                log.debug("Skipping Zip entry " + candidate.getName());
            } else {
                entry = candidate;
            }
        }

        if (entry == null) {
            closeArchive();
            return null;
        }

        log.debug("Extracting " + entry.getName());
        InputStream stream = null;
        try {
            stream = zipFile.getInputStream(entry);
            return IOUtils.toString(stream);
        } catch (IOException e) {
            log.error("Problems extracting entry " + entry.getName(), e);
            closeArchive();
            return null;
        } finally {
            IOUtils.closeQuietly(stream);
        }
    }

    /** Closes the archive, if open, and drops the pending entries. */
    private void closeArchive() {
        if (zipFile != null) {
            try {
                zipFile.close();
            } catch (IOException e) {
                log.warn("Error closing the .zip file", e);
            }
        }
        zipFile = null;
        entries = null;
    }

}

View File

@ -0,0 +1,51 @@
package eu.dnetlib.data.collector.plugins.datacite;
import java.text.ParseException;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.plugin.CollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Collector plugin that reads Datacite records through a {@link DataciteESIterator}, optionally
 * starting from a given date.
 */
public class DataciteCollectorPlugin extends AbstractCollectorPlugin implements CollectorPlugin {

    private static final Log log = LogFactory.getLog(DataciteCollectorPlugin.class);

    private DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd");

    /**
     * Builds an iterable that creates a new {@link DataciteESIterator} at every iteration.
     *
     * @param interfaceDescriptor the interface providing the base url of the service
     * @param fromDate            the start date; when blank the timestamp 0 is used
     * @param untilDate           not used
     * @return the iterable over the records
     * @throws CollectorServiceException when the base url is blank or fromDate cannot be parsed
     */
    @Override
    public Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String fromDate, String untilDate) throws CollectorServiceException {

        final String baseurl = interfaceDescriptor.getBaseUrl();
        if (StringUtils.isBlank(baseurl)) {
            throw new CollectorServiceException("baseUrl cannot be empty");
        }

        final long startTimestamp = StringUtils.isBlank(fromDate) ? 0 : toEpochSeconds(fromDate);

        return new Iterable<String>() {

            @Override
            public java.util.Iterator<String> iterator() {
                try {
                    return new DataciteESIterator(startTimestamp, baseurl);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
        };
    }

    /**
     * Parses the date with the accepted patterns and converts it to seconds since the epoch.
     *
     * @param fromDate the date to parse
     * @return the seconds since the epoch
     * @throws CollectorServiceException when the date matches none of the patterns
     */
    private long toEpochSeconds(final String fromDate) throws CollectorServiceException {
        final String[] acceptedPatterns = { "yyyy-MM-dd", "yyyy-MM-dd'T'HH:mm:ssXXX", "yyyy-MM-dd'T'HH:mm:ss.SSSX", "yyyy-MM-dd'T'HH:mm:ssZ",
                "yyyy-MM-dd'T'HH:mm:ss.SX" };
        try {
            final Date parsed = org.apache.commons.lang.time.DateUtils.parseDate(fromDate, acceptedPatterns);
            final long timestamp = parsed.getTime() / 1000;
            log.info("Querying for Datacite records from timestamp " + timestamp + " (date was " + fromDate + ")");
            return timestamp;
        } catch (ParseException e) {
            throw new CollectorServiceException(e);
        }
    }

}

View File

@ -0,0 +1,125 @@
package eu.dnetlib.data.collector.plugins.datacite;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayDeque;
import java.util.Iterator;
import java.util.Objects;
import java.util.Queue;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.data.collector.plugins.datacite.schema.DataciteSchema;
import eu.dnetlib.data.collector.plugins.datacite.schema.Result;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
/**
 * Iterator over the records exposed by a paged scan service: the first page is requested from
 * {@code <baseURL>/new_scan} and the following ones from {@code <baseURL>/scan/<scrollId>}. The body
 * of every result is Base64-decoded and inflated before being queued.
 */
public class DataciteESIterator implements Iterator<String> {

    private final long timestamp;

    private String scrollId;

    private Queue<String> currentPage;

    private final Gson g = new GsonBuilder().create();

    private String baseURL = "http://ip-90-147-167-25.ct1.garrservices.it:5000";

    private static final String START_PATH = "new_scan";
    private static final String NEXT_PATH = "scan/%s";

    /**
     * Creates the iterator and loads the first page.
     *
     * @param timestamp when greater than 0 it is sent as the {@code timestamp} query parameter of the first request
     * @param baseUrl   the base url of the service
     * @throws Exception when the first page cannot be downloaded
     */
    public DataciteESIterator(long timestamp, String baseUrl) throws Exception {
        this.timestamp = timestamp;
        this.baseURL = baseUrl;
        currentPage = new ArrayDeque<>();
        startRequest();
    }

    /**
     * Base64-decodes and inflates the body of a result.
     *
     * @param r the result to decode
     * @return the decoded record (platform default charset), or null when the body is missing, corrupted or truncated
     */
    private static String decompression(final Result r) {
        if (r == null || r.getBody() == null) { return null; }

        final Inflater decompresser = new Inflater();
        try {
            final byte[] byteArray = Base64.decodeBase64(r.getBody().getBytes());
            decompresser.setInput(byteArray);
            final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
            final byte[] buffer = new byte[8192];
            while (!decompresser.finished()) {
                final int size = decompresser.inflate(buffer);
                if (size == 0 && (decompresser.needsInput() || decompresser.needsDictionary())) {
                    // truncated input (or preset dictionary required): inflate() would return 0 forever
                    return null;
                }
                bos.write(buffer, 0, size);
            }
            return new String(bos.toByteArray());
        } catch (DataFormatException e) {
            return null;
        } finally {
            // release the native resources also when the data is corrupted
            decompresser.end();
        }
    }

    /**
     * Parses a page of results, stores its scroll id and queues the records that can be decoded.
     * Parsing problems are reported on the standard output/error and leave the queue untouched.
     *
     * @param hits the JSON page; blank strings and "[]" are ignored
     */
    private void fillQueue(final String hits) {
        if (StringUtils.isBlank(hits) || "[]".equalsIgnoreCase(hits.trim()))
            return;
        try {
            final DataciteSchema datacitepage = g.fromJson(hits, DataciteSchema.class);
            if (datacitepage == null) { return; }
            this.scrollId = datacitepage.getScrollId();
            if (datacitepage.getResult() == null) { return; }
            datacitepage.getResult().stream().map(DataciteESIterator::decompression).filter(Objects::nonNull).forEach(this.currentPage::add);
        } catch (RuntimeException e) {
            // best effort: a page that cannot be parsed ends the iteration instead of failing the caller
            System.out.println(hits);
            e.printStackTrace();
        }
    }

    /** Downloads the whole content of the url, closing the connection stream afterwards. */
    private static String download(final URL url) throws IOException {
        try (java.io.InputStream in = url.openStream()) {
            return IOUtils.toString(in);
        }
    }

    /** Requests the first page of the scan. */
    private void startRequest() throws Exception {
        final String url = baseURL + "/" + START_PATH;
        final URL startUrl = new URL(timestamp > 0 ? url + "?timestamp=" + timestamp : url);
        fillQueue(download(startUrl));
    }

    /** Requests the page following the current scroll id. */
    private void getNextPage() throws IOException {
        // only the path template is formatted: the base url may legitimately contain '%' characters
        final URL nextUrl = new URL(baseURL + "/" + String.format(NEXT_PATH, scrollId));
        fillQueue(download(nextUrl));
    }

    @Override
    public boolean hasNext() {
        return !currentPage.isEmpty();
    }

    /**
     * Returns the next record, pre-loading the following page when the current one is consumed.
     *
     * @return the next record, or null when no record is available
     * @throws RuntimeException when the following page cannot be downloaded
     */
    @Override
    public String next() {
        if (currentPage.isEmpty()) {
            return null;
        }

        final String nextItem = currentPage.remove();
        if (currentPage.isEmpty()) {
            try {
                getNextPage();
            } catch (Throwable e) {
                throw new RuntimeException(e);
            }
        }

        return nextItem;
    }

    /** @return the base url of the service */
    public String getBaseURL() {
        return baseURL;
    }

    /** @param baseURL the base url of the service */
    public void setBaseURL(final String baseURL) {
        this.baseURL = baseURL;
    }
}

View File

@ -0,0 +1,55 @@
package eu.dnetlib.data.collector.plugins.datacite.schema;
import java.util.List;
import com.google.gson.annotations.Expose;
import com.google.gson.annotations.SerializedName;
/**
 * Gson binding of one page returned by the scan service: the JSON properties {@code counter},
 * {@code result}, {@code scroll_id} and {@code total}.
 */
public class DataciteSchema {

    /** Mapped to the JSON property "counter". */
    @SerializedName("counter")
    @Expose
    private Integer counter;

    /** Mapped to the JSON property "result". */
    @SerializedName("result")
    @Expose
    private List<Result> result;

    /** Mapped to the JSON property "scroll_id". */
    @SerializedName("scroll_id")
    @Expose
    private String scrollId;

    /** Mapped to the JSON property "total". */
    @SerializedName("total")
    @Expose
    private Integer total;

    /** @return the value of "counter" */
    public Integer getCounter() {
        return this.counter;
    }

    /** @param counter the value of "counter" */
    public void setCounter(final Integer counter) {
        this.counter = counter;
    }

    /** @return the value of "result" */
    public List<Result> getResult() {
        return this.result;
    }

    /** @param result the value of "result" */
    public void setResult(final List<Result> result) {
        this.result = result;
    }

    /** @return the value of "scroll_id" */
    public String getScrollId() {
        return this.scrollId;
    }

    /** @param scrollId the value of "scroll_id" */
    public void setScrollId(final String scrollId) {
        this.scrollId = scrollId;
    }

    /** @return the value of "total" */
    public Integer getTotal() {
        return this.total;
    }

    /** @param total the value of "total" */
    public void setTotal(final Integer total) {
        this.total = total;
    }

}

View File

@ -0,0 +1,54 @@
package eu.dnetlib.data.collector.plugins.datacite.schema;
import com.google.gson.annotations.Expose;
import com.google.gson.annotations.SerializedName;
/**
 * Gson binding of a single entry of the "result" array: the JSON properties {@code body},
 * {@code id}, {@code originalId} and {@code timestamp}.
 */
public class Result {

    /** Mapped to the JSON property "body". */
    @SerializedName("body")
    @Expose
    private String body;

    /** Mapped to the JSON property "id". */
    @SerializedName("id")
    @Expose
    private String id;

    /** Mapped to the JSON property "originalId". */
    @SerializedName("originalId")
    @Expose
    private String originalId;

    /** Mapped to the JSON property "timestamp". */
    @SerializedName("timestamp")
    @Expose
    private Integer timestamp;

    /** @return the value of "body" */
    public String getBody() {
        return this.body;
    }

    /** @param body the value of "body" */
    public void setBody(final String body) {
        this.body = body;
    }

    /** @return the value of "id" */
    public String getId() {
        return this.id;
    }

    /** @param id the value of "id" */
    public void setId(final String id) {
        this.id = id;
    }

    /** @return the value of "originalId" */
    public String getOriginalId() {
        return this.originalId;
    }

    /** @param originalId the value of "originalId" */
    public void setOriginalId(final String originalId) {
        this.originalId = originalId;
    }

    /** @return the value of "timestamp" */
    public Integer getTimestamp() {
        return this.timestamp;
    }

    /** @param timestamp the value of "timestamp" */
    public void setTimestamp(final Integer timestamp) {
        this.timestamp = timestamp;
    }

}

View File

@ -0,0 +1,115 @@
package eu.dnetlib.data.collector.plugins.datasets;
import java.io.IOException;
import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * The Class DatasetsByJournalIterator: for every journal supplied by the input iterator it runs a
 * {@link DatasetsIterator} query and exposes the resulting records as a single sequence.
 */
public class DatasetsByJournalIterator implements Iterable<String>, Iterator<String> {

    /** The iterator over the datasets of the current journal; null until the journal is queried. */
    private Iterator<String> currentIterator;

    /** The current journal; null when the input is exhausted. */
    private PangaeaJournalInfo currentJournal;

    /** The journals to process. */
    private Iterator<PangaeaJournalInfo> inputIterator;

    /** The logger. */
    private static final Log log = LogFactory.getLog(DatasetsByJournalIterator.class);

    /**
     * @param iterator the journals whose datasets must be collected
     */
    public DatasetsByJournalIterator(final Iterator<PangaeaJournalInfo> iterator) {
        this.inputIterator = iterator;
        this.currentJournal = extractNextLine();
    }

    /**
     * Moves forward, journal by journal, until a journal with at least one dataset is found.
     *
     * @see java.util.Iterator#hasNext()
     */
    @Override
    public boolean hasNext() {
        while (this.currentJournal != null) {
            // Lazily query the current journal, so that the first journal is not skipped
            // when the object is used as an Iterator without calling iterator() first.
            if (currentIterator == null) {
                currentIterator = getNextIterator();
            }
            if (currentIterator.hasNext()) { return true; }

            // the datasets of the current journal are finished: move to the next journal
            this.currentJournal = extractNextLine();
            currentIterator = null;
        }
        // the last journal has been consumed
        return false;
    }

    /**
     * @return the next dataset record
     * @throws java.util.NoSuchElementException when no more records are available
     * @see java.util.Iterator#next()
     */
    @Override
    public String next() {
        if (!hasNext()) { throw new java.util.NoSuchElementException(); }
        return this.currentIterator.next();
    }

    /**
     * Not supported: the call is silently ignored.
     *
     * @see java.util.Iterator#remove()
     */
    @Override
    public void remove() {}

    /**
     * Starts (or restarts) the query for the current journal and returns this object, which is
     * simply empty when no journal is left.
     *
     * @see java.lang.Iterable#iterator()
     */
    @Override
    public Iterator<String> iterator() {
        if (this.currentJournal != null) {
            currentIterator = getNextIterator();
        }
        return this;
    }

    /**
     * Builds the query on the term "ft-techkeyword" for the id of the current journal.
     *
     * @return the iterator over the datasets of the current journal
     */
    private Iterator<String> getNextIterator() {
        QueryField q = new QueryField();
        RequestField r = new RequestField();
        r.setQuery(q);
        q.getTerm().put("ft-techkeyword", this.currentJournal.getJournalId());
        return new DatasetsIterator(r, "", this.currentJournal).iterator();
    }

    /**
     * Extract the next journal from the input.
     *
     * @return the next journal, or null when the input is exhausted
     */
    private PangaeaJournalInfo extractNextLine() {
        if (!this.inputIterator.hasNext()) { return null; }
        return this.inputIterator.next();
    }

}

View File

@ -0,0 +1,158 @@
package eu.dnetlib.data.collector.plugins.datasets;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.collect.Maps;
/**
 * The Class DatasetsByProjectIterator: reads a CSV of projects (4 fields separated by ';') and, for
 * every project, runs a {@link DatasetsIterator} query, exposing the records as a single sequence.
 */
public class DatasetsByProjectIterator implements Iterable<String>, Iterator<String> {

    private static final String SPLIT_REGEX = ";";

    /** The project id key. */
    public static String PROJECT_ID_KEY = "id";

    /** The project name key. */
    public static String PROJECT_NAME_KEY = "name";

    /** The project corda id key. */
    public static String PROJECT_CORDA_ID_KEY = "corda_id";

    /** The iterator over the datasets of the current project; null until the project is queried. */
    private Iterator<String> currentIterator;

    /** The csv reader. */
    private BufferedReader csvReader;

    /** The current project; null when the CSV is exhausted or a line could not be parsed. */
    private Map<String, String> currentProject;

    /** The logger. */
    private static final Log log = LogFactory.getLog(DatasetsByProjectIterator.class);

    /**
     * Instantiates a new datasets by project iterator and reads the first line of the CSV.
     *
     * @param csvInputStream
     *            the csv input stream
     * @throws IOException
     *             Signals that an I/O exception has occurred.
     */
    public DatasetsByProjectIterator(final InputStreamReader csvInputStream) throws IOException {
        this.csvReader = new BufferedReader(csvInputStream);
        this.currentProject = extractNextLine();
    }

    /**
     * Moves forward, project by project, until a project with at least one dataset is found.
     *
     * @see java.util.Iterator#hasNext()
     */
    @Override
    public boolean hasNext() {
        while (this.currentProject != null) {
            // Lazily query the current project, so that the first project is not skipped
            // when the object is used as an Iterator without calling iterator() first.
            if (currentIterator == null) {
                currentIterator = getNextIterator();
            }
            if (currentIterator.hasNext()) { return true; }

            // the datasets of the current project are finished: move to the next CSV line
            this.currentProject = extractNextLine();
            currentIterator = null;
        }
        // end of the CSV, or a line that could not be read
        return false;
    }

    /**
     * @return the next dataset record
     * @throws java.util.NoSuchElementException when no more records are available
     * @see java.util.Iterator#next()
     */
    @Override
    public String next() {
        if (!hasNext()) { throw new java.util.NoSuchElementException(); }
        return this.currentIterator.next();
    }

    /**
     * Not supported: the call is silently ignored.
     *
     * @see java.util.Iterator#remove()
     */
    @Override
    public void remove() {}

    /**
     * Starts (or restarts) the query for the current project and returns this object, which is
     * simply empty when no project is left.
     *
     * @see java.lang.Iterable#iterator()
     */
    @Override
    public Iterator<String> iterator() {
        if (this.currentProject != null) {
            currentIterator = getNextIterator();
        }
        return this;
    }

    /**
     * Builds the query on the term "ft-techkeyword" for the id of the current project.
     *
     * @return the iterator over the datasets of the current project
     */
    private Iterator<String> getNextIterator() {
        QueryField q = new QueryField();
        RequestField r = new RequestField();
        r.setQuery(q);
        q.getTerm().put("ft-techkeyword", this.currentProject.get(PROJECT_ID_KEY));
        return new DatasetsIterator(r, this.currentProject.get(PROJECT_CORDA_ID_KEY), null).iterator();
    }

    /**
     * Extract next line of the CSV. Fields 0, 2 and 3 are used as numeric id, name and Corda id.
     *
     * @return the map describing the project, or null at the end of the CSV, on read errors and on
     *         lines that cannot be parsed (in all these cases the reader is closed)
     */
    private Map<String, String> extractNextLine() {
        final String line;
        try {
            line = this.csvReader.readLine();
        } catch (IOException e) {
            log.error("Error reading the projects CSV", e);
            closeReader();
            return null;
        }
        // WE REACH THE END OF THE CSV
        if (line == null) {
            closeReader();
            return null;
        }
        log.debug("splitting line: " + line);
        final String[] values = line.split(SPLIT_REGEX);
        if (values.length != 4) {
            log.error("Error on splitting line, the length must be 4");
            closeReader();
            return null;
        }
        final int id;
        try {
            id = Integer.parseInt(values[0].trim());
        } catch (NumberFormatException e) {
            // handled like any other malformed line instead of escaping from hasNext()
            log.error("Error on parsing line, the project id must be an integer: " + values[0], e);
            closeReader();
            return null;
        }
        final String project_name = values[2];
        final String cordaId = values[3];
        final Map<String, String> splittedMap = Maps.newHashMap();
        splittedMap.put(PROJECT_CORDA_ID_KEY, cordaId);
        splittedMap.put(PROJECT_ID_KEY, "project" + id);
        splittedMap.put(PROJECT_NAME_KEY, project_name);
        log.debug(String.format("found project %s with id Corda: %s and id for API: %s", project_name, cordaId, "project" + id));
        return splittedMap;
    }

    /** Closes the CSV reader once the iteration cannot continue. */
    private void closeReader() {
        try {
            this.csvReader.close();
        } catch (IOException e) {
            log.warn("Error closing the projects CSV", e);
        }
    }

}

View File

@ -0,0 +1,27 @@
package eu.dnetlib.data.collector.plugins.datasets;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
 * Collector plugin that downloads a CSV of projects from the base url of the interface and
 * collects the datasets of every project through a {@link DatasetsByProjectIterator}.
 */
public class DatasetsByProjectPlugin extends AbstractCollectorPlugin {

    /**
     * Opens the CSV located at the base url and wraps it in a {@link DatasetsByProjectIterator}.
     * The date parameters are not used by this plugin.
     *
     * @param interfaceDescriptor the interface whose base url locates the CSV
     * @param fromDate            ignored
     * @param untilDate           ignored
     * @return the iterable over the collected records
     * @throws CollectorServiceException when the url is malformed or the CSV cannot be opened
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        InputStreamReader reader = null;
        try {
            final URL url = new URL(interfaceDescriptor.getBaseUrl());
            // the stream is decoded with the platform default charset
            reader = new InputStreamReader(url.openStream());
            return new DatasetsByProjectIterator(reader);
        } catch (IOException e) {
            closeQuietly(reader);
            throw new CollectorServiceException("OOOPS something bad happen on creating iterator ", e);
        } catch (RuntimeException e) {
            // do not leak the connection when the iterator cannot be built
            closeQuietly(reader);
            throw e;
        }
    }

    /** Closes the resource ignoring any error; used only on failure paths. */
    private static void closeQuietly(final java.io.Closeable resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (IOException ignored) {
                // the original failure is the one reported to the caller
            }
        }
    }

}

View File

@ -0,0 +1,274 @@
package eu.dnetlib.data.collector.plugins.datasets;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
/**
* The Class JournalIterator.
*/
public class DatasetsIterator implements Iterable<String>, Iterator<String> {
/** The logger. */
private static final Log log = LogFactory.getLog(DatasetsIterator.class);
/** The base url template. */
private static String BASE_URL_TEMPLATE = "http://ws.pangaea.de/es/pangaea/panmd/_search?_source=xml&size=%d&from=%d";
/** The journal id. */
private String journalId = "";
/** The journal name. */
private String journalName = "";
/** The journal issn. */
private String journalISSN = "";
/** The openaire datasource. */
private String openaireDatasource = "";
/** The total. */
private long total;
/** The from. */
private int from;
/** The current iterator. */
private int currentIterator;
/** The current response. */
private ElasticSearchResponse currentResponse;
/** The request. */
private RequestField request;
/** The default size. */
private static int DEFAULT_SIZE = 10;
private String projectCordaId;
private static String RECORD_TEMPLATE = "<datasetsRecord><oaf:projectid xmlns:oaf=\"http://namespace.openaire.eu/oaf\">%s</oaf:projectid>"
+ "<journal name='%s' issn='%s' datasourceid = '%s'/><metadata>%s</metadata></datasetsRecord>";
/**
* Instantiates a new journal iterator.
*
* @param request
* the request
*/
public DatasetsIterator(final RequestField request, final String projectCordaId, final PangaeaJournalInfo info) {
this.request = request;
this.setProjectCordaId(projectCordaId);
if (info != null) {
this.setJournalId(info.getJournalId());
this.setJournalName(StringEscapeUtils.escapeXml(info.getJournalName()));
this.setJournalISSN(info.getJournalISSN());
this.setOpenaireDatasource(info.getDatasourceId());
}
log.debug("Start Iterator");
}
/**
 * Execute query: POSTs the JSON serialization of the request to the search endpoint.
 *
 * @param from
 *            the offset of the first result to retrieve
 * @param size
 *            the maximum number of results to retrieve
 * @return the body of the response, or null when the status is not 200 or an error occurs
 */
private String executeQuery(final int from, final int size) {
    log.debug("executing query " + this.request.getQuery().getTerm());
    log.debug(String.format("from:%d size:%d", from, size));
    CloseableHttpResponse response = null;
    InputStream responseBody = null;
    final CloseableHttpClient httpclient = HttpClients.createDefault();
    try {
        final HttpPost post = new HttpPost(String.format(BASE_URL_TEMPLATE, size, from));
        final Gson g = new GsonBuilder().disableHtmlEscaping().create();
        post.setEntity(new StringEntity(g.toJson(this.request)));
        final long start = System.currentTimeMillis();
        response = httpclient.execute(post);
        final int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode != 200) {
            log.warn("Unexpected HTTP status " + statusCode + " for query :" + request.getQuery().getTerm());
            return null;
        }
        responseBody = response.getEntity().getContent();
        final String s = IOUtils.toString(responseBody);
        log.debug("Request done in " + (System.currentTimeMillis() - start) + " ms");
        return s;
    } catch (Exception e) {
        log.error("Error on executing query :" + request.getQuery().getTerm(), e);
        return null;
    } finally {
        // Each resource may still be null (non-200 status, early failure) and must be closed
        // independently, so that a problem with one of them does not leak the others.
        for (final java.io.Closeable resource : new java.io.Closeable[] { responseBody, response, httpclient }) {
            if (resource != null) {
                try {
                    resource.close();
                } catch (IOException e) {
                    log.error("Can't close connections gracefully", e);
                }
            }
        }
    }
}
/**
 * Returns the identifier of the journal the datasets belong to.
 *
 * @return the journalId
 */
public String getJournalId() {
    return this.journalId;
}

/**
 * Replaces the identifier of the journal the datasets belong to.
 *
 * @param journalId
 *            the journalId to set
 */
public void setJournalId(final String journalId) {
    this.journalId = journalId;
}
/**
 * Tells whether records are left, i.e. whether the absolute position (page offset plus index within the page) is still below the
 * total number of hits reported by the last response.
 *
 * @see java.util.Iterator#hasNext()
 */
@Override
public boolean hasNext() {
    final int absolutePosition = from + currentIterator;
    return absolutePosition < total;
}
/**
 * Returns the next dataset record wrapped in the record template, fetching the following page once the current one has been fully
 * consumed.
 *
 * @return the record XML
 * @throws java.util.NoSuchElementException
 *             when no records are left (including when iteration was never started or the last query failed)
 * @see java.util.Iterator#next()
 */
@Override
public String next() {
    // Honour the Iterator contract instead of failing with a NullPointerException / IndexOutOfBoundsException.
    if (!hasNext()) {
        throw new java.util.NoSuchElementException();
    }
    final String xml = String.format(RECORD_TEMPLATE, this.projectCordaId, this.journalName, this.journalISSN, this.openaireDatasource,
            currentResponse.getXmlRecords().get(currentIterator));
    currentIterator++;
    if (currentIterator == DEFAULT_SIZE) {
        // current page exhausted: load the following one
        getNextItem();
    }
    return xml;
}
/*
* Removal is not supported by this iterator: the method always throws UnsupportedOperationException.
*
* @see java.util.Iterator#remove()
*/
@Override
public void remove() {
throw new UnsupportedOperationException();
}
/**
 * Restarts the iteration from the first hit and returns this object as the iterator.
 * <p>
 * The position within the page is reset together with the offset: the page loader advances the offset by that position, so a stale
 * value would make a second iteration start from the wrong hit.
 * </p>
 *
 * @return this iterator, positioned on the first record of the first page
 * @see java.lang.Iterable#iterator()
 */
@Override
public Iterator<String> iterator() {
    from = 0;
    total = 0;
    currentIterator = 0;
    getNextItem();
    return this;
}
/**
 * Loads the next page of results: advances the offset by the number of records consumed from the current page, runs the query, stores
 * the parsed response and the total it reports (0 when no response could be parsed), and rewinds the in-page index.
 */
private void getNextItem() {
    from += currentIterator;
    final String rawResponse = executeQuery(from, DEFAULT_SIZE);
    currentResponse = ElasticSearchResponse.createNewResponse(rawResponse);
    if (currentResponse == null) {
        total = 0;
    } else {
        total = currentResponse.getTotal();
    }
    log.debug("from : " + from + " total of the request is " + total);
    currentIterator = 0;
}
/**
 * Returns the project identifier written into every record.
 *
 * @return the projectCordaId
 */
public String getProjectCordaId() {
    return this.projectCordaId;
}

/**
 * Replaces the project identifier written into every record.
 *
 * @param projectCordaId
 *            the projectCordaId to set
 */
public void setProjectCordaId(final String projectCordaId) {
    this.projectCordaId = projectCordaId;
}

/**
 * Returns the journal name written into every record.
 *
 * @return the journalName
 */
public String getJournalName() {
    return this.journalName;
}

/**
 * Replaces the journal name written into every record. The value is stored as given (no escaping is applied here).
 *
 * @param journalName
 *            the journalName to set
 */
public void setJournalName(final String journalName) {
    this.journalName = journalName;
}

/**
 * Returns the journal ISSN written into every record.
 *
 * @return the journalISSN
 */
public String getJournalISSN() {
    return this.journalISSN;
}

/**
 * Replaces the journal ISSN written into every record.
 *
 * @param journalISSN
 *            the journalISSN to set
 */
public void setJournalISSN(final String journalISSN) {
    this.journalISSN = journalISSN;
}

/**
 * Returns the datasource identifier written into every record.
 *
 * @return the openaireDatasource
 */
public String getOpenaireDatasource() {
    return this.openaireDatasource;
}

/**
 * Replaces the datasource identifier written into every record.
 *
 * @param openaireDatasource
 *            the openaireDatasource to set
 */
public void setOpenaireDatasource(final String openaireDatasource) {
    this.openaireDatasource = openaireDatasource;
}
}

View File

@ -0,0 +1,82 @@
package eu.dnetlib.data.collector.plugins.datasets;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
/**
 * Parsed view of a search response: the total number of hits and the XML payload ({@code _source.xml}) of every hit of the page.
 */
public class ElasticSearchResponse {

    /** The logger. */
    private static final Log log = LogFactory.getLog(ElasticSearchResponse.class);

    /** Total number of hits matching the query, as reported by the response. */
    private long total;

    /** The {@code _source.xml} value of each hit of the page, in response order. */
    private List<String> xmlRecords;

    /**
     * Parses a raw JSON search response.
     *
     * @param response
     *            the JSON text returned by the search endpoint, possibly {@code null}
     * @return the parsed response, or {@code null} when {@code response} is {@code null} or has no {@code hits} member
     */
    public static ElasticSearchResponse createNewResponse(final String response) {
        if (response == null) {
            log.fatal("Error: null elasticsearch reponse");
            return null;
        }
        final JsonObject root = new JsonParser().parse(response).getAsJsonObject();
        if (!root.has("hits")) {
            return null;
        }
        final JsonObject hits = root.get("hits").getAsJsonObject();

        final ElasticSearchResponse item = new ElasticSearchResponse();
        item.setTotal(readTotal(hits.get("total")));

        final List<String> records = new ArrayList<String>();
        for (final JsonElement hit : hits.get("hits").getAsJsonArray()) {
            final JsonObject source = hit.getAsJsonObject().get("_source").getAsJsonObject();
            records.add(source.get("xml").getAsString());
        }
        item.setXmlRecords(records);
        return item;
    }

    /**
     * Reads the total number of hits, accepting both the plain number and the object form
     * ({@code {"value": n, "relation": "eq"}}) used by Elasticsearch 7 and later.
     *
     * @param total
     *            the {@code hits.total} member of the response
     * @return the number of hits
     */
    private static long readTotal(final JsonElement total) {
        if (total.isJsonObject()) {
            return total.getAsJsonObject().get("value").getAsLong();
        }
        return total.getAsLong();
    }

    /**
     * @return the xmlRecords
     */
    public List<String> getXmlRecords() {
        return xmlRecords;
    }

    /**
     * @param xmlRecords
     *            the xmlRecords to set
     */
    public void setXmlRecords(final List<String> xmlRecords) {
        this.xmlRecords = xmlRecords;
    }

    /**
     * @return the total
     */
    public long getTotal() {
        return total;
    }

    /**
     * @param total
     *            the total to set
     */
    public void setTotal(final long total) {
        this.total = total;
    }
}

View File

@ -0,0 +1,92 @@
package eu.dnetlib.data.collector.plugins.datasets;
/**
 * Plain holder of the metadata of a Pangaea journal: name, identifier, ISSN and the identifier of the related datasource.
 */
public class PangaeaJournalInfo {

    /** The journal id. */
    private String journalId;

    /** The journal name. */
    private String journalName;

    /** The journal issn. */
    private String journalISSN;

    /** The datasource id. */
    private String datasourceId;

    /**
     * Returns the journal id.
     *
     * @return the journal id
     */
    public String getJournalId() {
        return this.journalId;
    }

    /**
     * Replaces the journal id.
     *
     * @param journalId
     *            the new journal id
     */
    public void setJournalId(final String journalId) {
        this.journalId = journalId;
    }

    /**
     * Returns the journal name.
     *
     * @return the journal name
     */
    public String getJournalName() {
        return this.journalName;
    }

    /**
     * Replaces the journal name.
     *
     * @param journalName
     *            the new journal name
     */
    public void setJournalName(final String journalName) {
        this.journalName = journalName;
    }

    /**
     * Returns the journal ISSN.
     *
     * @return the journalISSN
     */
    public String getJournalISSN() {
        return this.journalISSN;
    }

    /**
     * Replaces the journal ISSN.
     *
     * @param journalISSN
     *            the journalISSN to set
     */
    public void setJournalISSN(final String journalISSN) {
        this.journalISSN = journalISSN;
    }

    /**
     * Returns the datasource id.
     *
     * @return the datasource id
     */
    public String getDatasourceId() {
        return this.datasourceId;
    }

    /**
     * Replaces the datasource id.
     *
     * @param datasourceId
     *            the new datasource id
     */
    public void setDatasourceId(final String datasourceId) {
        this.datasourceId = datasourceId;
    }
}

View File

@ -0,0 +1,29 @@
package eu.dnetlib.data.collector.plugins.datasets;
import java.util.HashMap;
import java.util.Map;
/**
 * The {@code query} part of a search request: a map of term names to the values they must match.
 */
public class QueryField {

    /** Term name to term value. */
    private Map<String, String> term;

    /**
     * Creates a query with an empty, mutable term map.
     */
    public QueryField() {
        final Map<String, String> emptyTerms = new HashMap<String, String>();
        setTerm(emptyTerms);
    }

    /**
     * Returns the term map (the live instance, not a copy).
     *
     * @return the term
     */
    public Map<String, String> getTerm() {
        return this.term;
    }

    /**
     * Replaces the term map.
     *
     * @param term
     *            the term to set
     */
    public void setTerm(final Map<String, String> term) {
        this.term = term;
    }
}

View File

@ -0,0 +1,21 @@
package eu.dnetlib.data.collector.plugins.datasets;
/**
 * Root object of a search request; it only carries the {@code query} member.
 */
public class RequestField {

    /** The query of this request. */
    private QueryField query;

    /**
     * Returns the query of this request.
     *
     * @return the query
     */
    public QueryField getQuery() {
        return this.query;
    }

    /**
     * Replaces the query of this request.
     *
     * @param query
     *            the query to set
     */
    public void setQuery(final QueryField query) {
        this.query = query;
    }
}

View File

@ -0,0 +1,66 @@
package eu.dnetlib.data.collector.plugins.datasources;
import java.io.IOException;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.plugins.HttpConnector;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.io.IOUtils;
import org.springframework.beans.factory.annotation.Autowired;
/**
 * Plugin to collect metadata records about data repositories from re3data.
 * <p>
 * Documentation on re3data API: http://service.re3data.org/api/doc.
 * </p>
 * <p>
 * BaseURL: http://service.re3data.org
 * </p>
 * <p>
 * API to get the list of repos: baseURL + /api/v1/repositories
 * </p>
 * <p>
 * API to get a repository: baseURL + content of link/@href of the above list
 * </p>
 *
 * @author alessia
 *
 */
public class Re3DataCollectorPlugin extends AbstractCollectorPlugin {

    /** Path, relative to the base URL, of the API listing all repositories. */
    private String repositoryListPath = "/api/v1/repositories";

    @Autowired
    private HttpConnector httpConnector;

    /**
     * Downloads the list of repositories and returns an iterable that fetches the description of each listed repository.
     *
     * @param interfaceDescriptor
     *            descriptor providing the base URL of the re3data service
     * @param fromDate
     *            not used by this plugin
     * @param untilDate
     *            not used by this plugin
     * @return an iterable over the repository descriptions
     * @throws CollectorServiceException
     *             when the repository list cannot be turned into an input stream, or is raised by the underlying calls
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        final String repositoryListURL = interfaceDescriptor.getBaseUrl() + repositoryListPath;
        try {
            final String repositoryList = httpConnector.getInputSource(repositoryListURL);
            return new Re3DataRepositoriesIterator(IOUtils.toInputStream(repositoryList, "UTF-8"), interfaceDescriptor.getBaseUrl(),
                    getHttpConnector());
        } catch (IOException e) {
            throw new CollectorServiceException(e);
        }
    }

    public String getRepositoryListPath() {
        return this.repositoryListPath;
    }

    public void setRepositoryListPath(final String repositoryListPath) {
        this.repositoryListPath = repositoryListPath;
    }

    public HttpConnector getHttpConnector() {
        return this.httpConnector;
    }

    public void setHttpConnector(final HttpConnector httpConnector) {
        this.httpConnector = httpConnector;
    }
}

View File

@ -0,0 +1,151 @@
package eu.dnetlib.data.collector.plugins.datasources;
import java.io.InputStream;
import java.util.Iterator;
import java.util.NoSuchElementException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import eu.dnetlib.data.collector.plugins.HttpConnector;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
/**
 * Iterates over the repositories listed in a re3data repository-list document: every {@code link} element of the list provides (in
 * its {@code href} attribute) the path of a repository, whose description is downloaded when the element is returned by
 * {@link #next()}.
 */
public class Re3DataRepositoriesIterator implements Iterator<String>, Iterable<String> {

    private static final Log log = LogFactory.getLog(Re3DataRepositoriesIterator.class); // NOPMD by marko on 11/24/08 5:02 PM

    /** Base URL prepended to repository paths that do not already start with it. */
    private String baseURL;

    /** Streaming reader over the repository list. */
    private XMLStreamReader reader;

    /** Number of {@code link} elements met so far. */
    private int countedRepos = 0;

    /** Path of the repository returned by the next call to {@link #next()}; {@code null} when the list is exhausted. */
    private String currentRepoPath = null;

    private HttpConnector httpConnector;

    /**
     * Creates the iterator and positions it on the first repository of the list.
     *
     * @param xmlInputStream
     *            the repository list, as XML
     * @param baseUrl
     *            base URL of the service
     * @param httpConnector
     *            connector used to download the description of each repository
     * @throws CollectorServiceException
     *             when the XML reader cannot be created
     */
    public Re3DataRepositoriesIterator(final InputStream xmlInputStream, final String baseUrl, final HttpConnector httpConnector)
            throws CollectorServiceException {
        this.httpConnector = httpConnector;
        final XMLInputFactory factory = XMLInputFactory.newInstance();
        // The list comes from a remote service: refuse DTDs and external entities (XXE hardening).
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        try {
            reader = factory.createXMLStreamReader(xmlInputStream);
        } catch (XMLStreamException e) {
            throw new CollectorServiceException(e);
        }
        baseURL = baseUrl;

        // try to fetch the 1st
        currentRepoPath = moveToNextRepo();
    }

    @Override
    public boolean hasNext() {
        return currentRepoPath != null;
    }

    /**
     * Downloads and returns the description of the current repository, then moves to the next one.
     * <p>
     * The cursor advances even when the download fails, so a caller catching the exception can go on with the following repository.
     * </p>
     *
     * @throws NoSuchElementException
     *             when the list is exhausted
     */
    @Override
    public String next() {
        if (currentRepoPath == null) {
            throw new NoSuchElementException();
        }
        try {
            return getRepositoryInfo(currentRepoPath);
        } finally {
            currentRepoPath = moveToNextRepo();
        }
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    @Override
    public Iterator<String> iterator() {
        return this;
    }

    /** Reads the {@code href} attribute of the element the reader is positioned on. */
    private String getNextRepositoryPath() {
        return reader.getAttributeValue(null, "href");
    }

    /**
     * Advances the reader to the next {@code link} element.
     *
     * @return the value of its {@code href} attribute, or {@code null} when the document has no more {@code link} elements
     */
    private String moveToNextRepo() {
        try {
            while (reader.hasNext()) {
                final int event = reader.next();
                if (event == XMLStreamConstants.START_ELEMENT && "link".equals(reader.getLocalName())) {
                    final String repoPath = getNextRepositoryPath();
                    log.debug(String.format("Found %s repositories. The last has link %s", ++countedRepos, repoPath));
                    return repoPath;
                }
            }
            log.info("Seems there are no more repository to iterate on. Total: " + countedRepos);
            return null;
        } catch (XMLStreamException e) {
            throw new CollectorServiceRuntimeException(e);
        }
    }

    /**
     * Downloads the description of a repository.
     *
     * @param repositoryPath
     *            path of the repository; the base URL is prepended unless the path already starts with it
     * @return the description, after entity cleaning
     * @throws CollectorServiceRuntimeException
     *             when the download fails
     */
    private String getRepositoryInfo(final String repositoryPath) throws CollectorServiceRuntimeException {
        final String targetURL = repositoryPath.startsWith(baseURL) ? repositoryPath : baseURL + repositoryPath;
        try {
            log.info(targetURL);
            final String inputSource = getHttpConnector().getInputSource(targetURL);
            return XmlCleaner.cleanAllEntities(inputSource);
        } catch (CollectorServiceException e) {
            throw new CollectorServiceRuntimeException("OOOPS something bad happen getting repo info from " + targetURL, e);
        }
    }

    public String getBaseURL() {
        return baseURL;
    }

    public void setBaseURL(final String baseURL) {
        this.baseURL = baseURL;
    }

    public int getCountedRepos() {
        return countedRepos;
    }

    public void setCountedRepos(final int countedRepos) {
        this.countedRepos = countedRepos;
    }

    public XMLStreamReader getReader() {
        return reader;
    }

    public void setReader(final XMLStreamReader reader) {
        this.reader = reader;
    }

    public String getCurrentRepoPath() {
        return currentRepoPath;
    }

    public void setCurrentRepoPath(final String currentRepoPath) {
        this.currentRepoPath = currentRepoPath;
    }

    public HttpConnector getHttpConnector() {
        return httpConnector;
    }

    public void setHttpConnector(final HttpConnector httpConnector) {
        this.httpConnector = httpConnector;
    }
}

View File

@ -0,0 +1,57 @@
package eu.dnetlib.data.collector.plugins.excel;
/**
* Created by miriam on 10/05/2017.
*/
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.CSVFormat;
/**
 * Accumulates a header and a list of rows and writes them to a UTF-8 CSV file.
 */
public class CSVFileWriter {

    private static final String NEW_LINE_SEPARATOR = "\n";

    /** Header cells; {@code null} until {@link #setHeader(String[])} is called. */
    private Object[] file_header;

    /** Rows to write, in insertion order. */
    private ArrayList<ArrayList<String>> projects = new ArrayList<ArrayList<String>>();

    /**
     * Sets the header row.
     *
     * @param header
     *            the header cells
     */
    public void setHeader(String[] header) {
        this.file_header = header;
    }

    /**
     * Appends a row.
     *
     * @param project
     *            the cells of the row
     */
    public void addProject(ArrayList<String> project) {
        projects.add(project);
    }

    /**
     * Writes the header (when one was set) followed by all the rows to the given file, replacing any existing content.
     *
     * @param csv_file_path
     *            path of the file to write
     * @throws IllegalStateException
     *             when the file cannot be written; failing here prevents callers from going on with a missing or truncated file
     */
    public void writeFile(String csv_file_path) {
        final CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR);
        // try-with-resources flushes and closes printer and writer, also when the file could only partially be written
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csv_file_path), "UTF-8"));
                CSVPrinter csvFilePrinter = new CSVPrinter(writer, csvFileFormat)) {
            if (file_header != null) {
                csvFilePrinter.printRecord(file_header);
            }
            for (ArrayList<String> project : projects) {
                csvFilePrinter.printRecord(project);
            }
        } catch (IOException e) {
            throw new IllegalStateException("Cannot write CSV file " + csv_file_path, e);
        }
    }
}

View File

@ -0,0 +1,256 @@
package eu.dnetlib.data.collector.plugins.excel;
/**
* Created by miriam on 10/05/2017.
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.json.*;
import org.apache.commons.io.FileUtils;
/**
 * Converts one sheet of a remote Excel (xlsx) file into a local CSV file and delegates the collection of that CSV file to the
 * {@link HttpCSVCollectorPlugin}.
 * <p>
 * Parameters read from the interface descriptor: {@code argument}, {@code header_row}, {@code tmp_file}, {@code remove_empty_lines},
 * {@code remove_lines_with_id}, {@code col_id}, {@code remove_tmp_file}, {@code sheet_number}, {@code file_to_save},
 * {@code separator}, {@code quote}.
 * </p>
 * <p>
 * Example of the {@code argument} parameter (JSON):
 * </p>
 *
 * <pre>
 * {"replace":{"header":[{"from":"&amp;","to":"and"}],"body":[{"from":"\n","to":" "}]},
 *  "replace_currency":[{"from":"$","to":"€"}],
 *  "col_currency":10}
 * </pre>
 */
public class Read {

    private static final Log log = LogFactory.getLog(Read.class);

    /** Cell formatter, created once and reused for every cell of the sheet. */
    private final DataFormatter formatter = new DataFormatter();

    /** The descriptor. */
    private InterfaceDescriptor descriptor;

    private Sheet sheet;
    private CSVFileWriter csv_writer = new CSVFileWriter();
    /** Replacements (from -> to) applied to header cells. */
    private HashMap<String, String> map_header = new HashMap<String, String>();
    /** Replacements (from -> to) applied to data cells. */
    private HashMap<String, String> map_body = new HashMap<String, String>();
    private int header_row;
    private String file_to_save;
    private boolean replace_currency = false;
    private String from_currency, to_currency;
    private boolean remove_empty, remove_tmp_file;
    private String remove_id;
    private int column_id;
    private int currency_column;
    private int sheet_number;
    private String tmp_file;
    private String argument;
    /** Header (after replacements) of the column holding the record identifier. */
    private String identifier;
    private HttpCSVCollectorPlugin collector;

    public Read(InterfaceDescriptor descriptor) {
        this.descriptor = descriptor;
    }

    public HttpCSVCollectorPlugin getCollector() {
        return collector;
    }

    public void setCollector(HttpCSVCollectorPlugin collector) {
        this.collector = collector;
    }

    /** Returns the cell content formatted as it is displayed in the spreadsheet. */
    private String getCellValue(Cell cell) {
        return formatter.formatCellValue(cell);
    }

    /** Downloads the Excel file from the base URL of the descriptor into the temporary file. */
    private void copyFile() throws IOException {
        FileUtils.copyURLToFile(new URL(descriptor.getBaseUrl()), new File(tmp_file));
    }

    /** Copies the parameters of the descriptor into the fields of this object. */
    private void parseDescriptor() {
        HashMap<String, String> params = descriptor.getParams();
        argument = params.get("argument");
        header_row = Integer.parseInt(params.get("header_row"));
        tmp_file = params.get("tmp_file");
        // compare by value: identity comparison (==) of strings is false for values read from a profile
        remove_empty = "yes".equals(params.get("remove_empty_lines"));
        remove_id = params.get("remove_lines_with_id");
        column_id = Integer.parseInt(params.get("col_id"));
        remove_tmp_file = "yes".equals(params.get("remove_tmp_file"));
        sheet_number = Integer.parseInt(params.get("sheet_number"));
        file_to_save = params.get("file_to_save");
    }

    /** Reads the parameters, downloads the Excel file and loads the requested sheet. */
    private void init() throws IOException {
        parseDescriptor();
        log.info("Parsing the arguments");
        parseArguments();
        log.info("Copying the file in temp local file");
        copyFile();
        log.info("Extracting the sheet " + sheet_number);
        try (FileInputStream fis = new FileInputStream(tmp_file)) {
            // NOTE(review): the workbook is intentionally left open, the sheet is read after this method returns.
            Workbook workbook = new XSSFWorkbook(fis);
            sheet = workbook.getSheetAt(sheet_number);
        }
        if (remove_tmp_file) {
            File f = new File(tmp_file);
            if (!f.delete()) {
                log.warn("Cannot delete the temporary file " + tmp_file);
            }
        }
    }

    /** Adds to {@code map} the from/to pairs listed under {@code replace.<elem>} of the argument. */
    private void fillMap(JSONObject json, HashMap<String, String> map, String elem) {
        try {
            final JSONArray arr = json.getJSONObject("replace").getJSONArray(elem);
            for (Object entry : arr) {
                map.put(((JSONObject) entry).getString("from"), ((JSONObject) entry).getString("to"));
            }
        } catch (RuntimeException e) {
            log.error("Problems filling the map for " + elem);
            throw e;
        }
    }

    /** Parses the JSON {@code argument} parameter (replacement rules and currency settings), when present. */
    private void parseArguments() {
        if (StringUtils.isEmpty(argument)) {
            return;
        }
        try {
            final JSONObject json = new JSONObject(argument);
            // "header" and "body" live inside the "replace" object, so that is where their presence must be checked
            if (json.has("replace")) {
                final JSONObject replace = json.getJSONObject("replace");
                if (replace.has("header")) {
                    fillMap(json, map_header, "header");
                }
                if (replace.has("body")) {
                    fillMap(json, map_body, "body");
                }
            }
            if (json.has("replace_currency")) {
                replace_currency = true;
                final JSONObject currency = json.getJSONArray("replace_currency").getJSONObject(0);
                from_currency = currency.getString("from");
                to_currency = currency.getString("to");
            }
            if (json.has("col_currency")) {
                currency_column = json.getInt("col_currency");
            }
        } catch (RuntimeException e) {
            log.error("Problems while parsing the argument parameter.");
            throw e;
        }
    }

    /** Applies every from/to replacement of {@code replace} to {@code row}. */
    private String applyReplace(String row, HashMap<String, String> replace) {
        for (java.util.Map.Entry<String, String> rule : replace.entrySet()) {
            row = row.replace(rule.getKey(), rule.getValue());
        }
        return row;
    }

    /** Reads the header row, applies the header replacements and remembers the header of the identifier column. */
    private void getHeader() {
        final Row row = sheet.getRow(header_row);
        if (row == null) {
            throw new IllegalStateException("Header row " + header_row + " not found in sheet " + sheet_number);
        }
        // cells are collected in a list: joining and splitting on ';' would break headers containing that character
        final ArrayList<String> header = new ArrayList<String>();
        final Iterator<Cell> cellIterator = row.cellIterator();
        int count = 0;
        while (cellIterator.hasNext()) {
            final String value = applyReplace(cellIterator.next().getStringCellValue(), map_header);
            header.add(value);
            if (count++ == column_id) {
                identifier = value;
            }
        }
        csv_writer.setHeader(header.toArray(new String[header.size()]));
    }

    /**
     * Reads the rows following the header and hands them to the CSV writer, discarding the rows whose identifier cell is empty (when
     * {@code remove_empty_lines} is "yes") or equal to {@code remove_lines_with_id}.
     */
    private void getData() {
        // getLastRowNum() is the 0-based index of the last row, hence the inclusive bound
        for (int row_number = header_row + 1; row_number <= sheet.getLastRowNum(); row_number++) {
            final Row row = sheet.getRow(row_number);
            if (row == null) {
                continue;
            }
            // NOTE(review): cellIterator() skips cells that are not defined, so col_number may drift from the real
            // column index on rows with gaps — confirm whether the input sheets can contain such rows.
            final Iterator<Cell> cellIterator = row.cellIterator();
            int col_number = 0;
            boolean discard_row = false;
            final ArrayList<String> al = new ArrayList<String>();
            while (cellIterator.hasNext() && !discard_row) {
                String tmp = getCellValue(cellIterator.next()).trim();
                tmp = tmp.replace("\n", " ");
                if (col_number == column_id
                        && ((remove_empty && tmp.trim().isEmpty()) || (StringUtils.isNotEmpty(remove_id) && tmp.equals(remove_id)))) {
                    discard_row = true;
                }
                if (replace_currency && col_number == currency_column) {
                    tmp = tmp.replace(from_currency, to_currency);
                }
                al.add(applyReplace(tmp, map_body));
                col_number++;
            }
            if (!discard_row) {
                csv_writer.addProject(al);
            }
        }
    }

    private void writeCSVFile() {
        csv_writer.writeFile(file_to_save);
    }

    /** Builds the descriptor used to collect the generated CSV file. */
    private InterfaceDescriptor prepareHTTPCSVDescriptor() {
        InterfaceDescriptor dex = new InterfaceDescriptor();
        dex.setBaseUrl("file://" + file_to_save);
        HashMap<String, String> params = new HashMap<String, String>();
        params.put("separator", descriptor.getParams().get("separator"));
        params.put("identifier", identifier);
        params.put("quote", descriptor.getParams().get("quote"));
        dex.setParams(params);
        return dex;
    }

    /**
     * Downloads the Excel file, converts the configured sheet to CSV and collects the CSV file.
     *
     * @return the records produced by the CSV collector
     * @throws Exception
     *             when the download, the conversion or the collection fails
     */
    public Iterable<String> parseFile() throws Exception {
        init();
        log.info("Getting header elements");
        getHeader();
        log.info("Getting sheet data");
        getData();
        log.info("Writing the csv file");
        writeCSVFile();
        log.info("Preparing to parse csv");
        return collector.collect(prepareHTTPCSVDescriptor(), "", "");
    }
}

View File

@ -0,0 +1,39 @@
package eu.dnetlib.data.collector.plugins.excel;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Required;
/**
 * Collector plugin for Excel files: the conversion and the collection are delegated to {@link Read}, which is given the CSV collector
 * plugin injected here.
 *
 * Created by miriam on 10/05/2017.
 */
public class ReadExcelPlugin extends AbstractCollectorPlugin {

    private static final Log log = LogFactory.getLog(ReadExcelPlugin.class);

    @Autowired
    HttpCSVCollectorPlugin httpCSVCollectorPlugin;

    /**
     * Collects the records of the Excel file described by the given interface.
     *
     * @param interfaceDescriptor
     *            descriptor of the Excel file to import
     * @param fromDate
     *            not used by this plugin
     * @param untilDate
     *            not used by this plugin
     * @return the collected records
     * @throws CollectorServiceException
     *             wrapping any error raised while importing the file
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        final Read excelReader = new Read(interfaceDescriptor);
        excelReader.setCollector(httpCSVCollectorPlugin);
        try {
            return excelReader.parseFile();
        } catch (Exception e) {
            log.error("Error importing excel file");
            throw new CollectorServiceException(e);
        }
    }
}

View File

@ -0,0 +1,27 @@
/**
*
*/
package eu.dnetlib.data.collector.plugins.filesfrommetadata;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
* Collector plugin whose {@code collect} method is not implemented: it always returns {@code null}.
*
* @author sandro
*
*/
public class FilesFromMetadataCollectorPlugin extends AbstractCollectorPlugin {
/**
* Not implemented: returns {@code null} whatever the arguments.
* <p>
* NOTE(review): a caller iterating over the result would fail with a NullPointerException; presumably this plugin is only used for
* its protocol descriptor and never asked to collect — confirm before relying on it.
* </p>
*
* {@inheritDoc}
* @see eu.dnetlib.data.collector.plugin.CollectorPlugin#collect(eu.dnetlib.data.collector.rmi.InterfaceDescriptor, java.lang.String, java.lang.String)
*/
@Override
public Iterable<String> collect(final InterfaceDescriptor arg0, final String arg1, final String arg2) throws CollectorServiceException {
// TODO Auto-generated method stub
return null;
}
}

View File

@ -0,0 +1,61 @@
package eu.dnetlib.data.collector.plugins.filesfrommetadata;
import java.util.List;
import java.util.Map;
import com.google.common.base.Function;
import com.google.common.collect.Lists;
import eu.dnetlib.data.collector.functions.ParamValuesFunction;
import eu.dnetlib.data.collector.rmi.ProtocolParameterValue;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.enabling.locators.UniqueServiceLocator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
/**
 * Provides the admitted values of a protocol parameter by running the configured xquery on the lookup service: every result becomes a
 * parameter value.
 *
 * Created by alessia on 17/12/15.
 */
public class PopulateFileDownloadBasePath implements ParamValuesFunction {

    private static final Log log = LogFactory.getLog(PopulateFileDownloadBasePath.class);

    @Autowired
    private UniqueServiceLocator serviceLocator;

    @Value("${services.objectstore.basePathList.xquery}")
    private String xQueryForObjectStoreBasePath;

    /**
     * Runs the configured xquery and wraps every result in a {@link ProtocolParameterValue} built from the result itself, passed for
     * both constructor arguments.
     *
     * @param s
     *            not used
     * @param map
     *            not used
     * @return a view over the query results, or a new empty list when the lookup fails (the failure is logged)
     */
    @Override
    public List<ProtocolParameterValue> findValues(final String s, final Map<String, String> map) {
        final Function<String, ProtocolParameterValue> toParameterValue = new Function<String, ProtocolParameterValue>() {

            @Override
            public ProtocolParameterValue apply(final String basePath) {
                return new ProtocolParameterValue(basePath, basePath);
            }
        };
        try {
            final ISLookUpService lookUpService = serviceLocator.getService(ISLookUpService.class);
            return Lists.transform(lookUpService.quickSearchProfile(xQueryForObjectStoreBasePath), toParameterValue);
        } catch (ISLookUpException e) {
            log.error("Cannot read Object store service properties", e);
            return Lists.newArrayList();
        }
    }

    public UniqueServiceLocator getServiceLocator() {
        return this.serviceLocator;
    }

    public void setServiceLocator(final UniqueServiceLocator serviceLocator) {
        this.serviceLocator = serviceLocator;
    }

    public String getxQueryForObjectStoreBasePath() {
        return this.xQueryForObjectStoreBasePath;
    }

    public void setxQueryForObjectStoreBasePath(final String xQueryForObjectStoreBasePath) {
        this.xQueryForObjectStoreBasePath = xQueryForObjectStoreBasePath;
    }
}

View File

@ -0,0 +1,89 @@
package eu.dnetlib.data.collector.plugins.filesystem;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.collect.Iterators;
import com.google.common.collect.Sets;
/**
 * Class enabling lazy and recursive iteration of a filesystem tree. The iterator iterates over file paths.
 * <p>
 * Directories are visited in the order they are discovered (first in, first out). Only one directory is open at any time and it is
 * closed as soon as its content has been consumed, so the walk does not accumulate open directory handles.
 * </p>
 *
 * @author Andrea
 *
 */
public class FileSystemIterator implements Iterator<String> {

    /** The logger */
    private static final Log log = LogFactory.getLog(FileSystemIterator.class);

    /** Admitted file extensions; when empty every file is admitted. */
    private Set<String> extensions = Sets.newHashSet();

    /** Directories discovered during the walk whose content has not been listed yet. */
    private final java.util.Deque<Path> pendingDirectories = new java.util.ArrayDeque<Path>();

    /** The directory currently open, backing {@link #pathIterator}; {@code null} when none is open. */
    private java.nio.file.DirectoryStream<Path> currentStream;

    /** Iterator over the entries of the directory currently open; {@code null} when none is open. */
    private Iterator<Path> pathIterator;

    /** The path returned by the next call to {@link #next()}; {@code null} when the walk is over. */
    private String current;

    /**
     * Creates the iterator and positions it on the first admitted file.
     *
     * @param baseDir
     *            the directory to walk
     * @param extensions
     *            comma-separated list of admitted extensions; blank or {@code null} to admit every file
     * @throws RuntimeException
     *             when the base directory cannot be opened
     */
    public FileSystemIterator(final String baseDir, final String extensions) {
        if (StringUtils.isNotBlank(extensions)) {
            this.extensions = Sets.newHashSet(extensions.split(","));
        }
        try {
            openDirectory(Paths.get(baseDir));
            this.current = walkTillNext();
        } catch (IOException e) {
            log.error("Cannot initialize File System Iterator. Is this path correct? " + baseDir);
            throw new RuntimeException("Filesystem collection error.", e);
        }
    }

    @Override
    public boolean hasNext() {
        return current != null;
    }

    /**
     * @throws java.util.NoSuchElementException
     *             when the walk is over
     */
    @Override
    public synchronized String next() {
        if (current == null) {
            throw new java.util.NoSuchElementException();
        }
        final String pivot = current;
        current = walkTillNext();
        log.debug("Returning: " + pivot);
        return pivot;
    }

    /**
     * Removal is not supported, consistently with the other iterators of the collector plugins.
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Walk the filesystem recursively until it finds a candidate. Strategies: a) Any directory found during the walk is queued and
     * listed once the directories found before it have been consumed; b) Any file is checked against admitted extensions
     *
     * @return the next element to be returned by next call of this.next(), or {@code null} when the walk is over
     */
    private synchronized String walkTillNext() {
        while (moveToAvailableEntry()) {
            final Path nextFilePath = pathIterator.next();
            if (Files.isDirectory(nextFilePath)) {
                pendingDirectories.addLast(nextFilePath);
                log.debug("Adding folder iterator: " + nextFilePath.toString());
            } else if (extensions.isEmpty() || extensions.contains(FilenameUtils.getExtension(nextFilePath.toString()))) {
                log.debug("Returning: " + nextFilePath.toString());
                return nextFilePath.toString();
            }
        }
        return null;
    }

    /**
     * Makes sure {@link #pathIterator} has an entry to return, closing the exhausted directory and opening the pending ones as needed.
     * A directory that cannot be opened is logged and skipped, so that it does not silently end the whole walk.
     *
     * @return {@code false} when nothing is left to visit
     */
    private boolean moveToAvailableEntry() {
        while (pathIterator == null || !pathIterator.hasNext()) {
            closeCurrentDirectory();
            final Path nextDirectory = pendingDirectories.pollFirst();
            if (nextDirectory == null) {
                return false;
            }
            try {
                openDirectory(nextDirectory);
            } catch (IOException e) {
                log.error("Cannot create folder iterator! Is this path correct? " + nextDirectory.toString(), e);
            }
        }
        return true;
    }

    /** Opens the given directory and makes it the current one. */
    private void openDirectory(final Path directory) throws IOException {
        currentStream = Files.newDirectoryStream(directory);
        pathIterator = currentStream.iterator();
    }

    /** Closes the current directory, if any, releasing its handle. */
    private void closeCurrentDirectory() {
        if (currentStream != null) {
            try {
                currentStream.close();
            } catch (IOException e) {
                log.warn("Cannot close folder iterator", e);
            }
            currentStream = null;
        }
        pathIterator = null;
    }
}

View File

@ -0,0 +1,23 @@
package eu.dnetlib.data.collector.plugins.filesystem;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
 * Collector plugin reading records from the files of a directory tree.
 *
 * @author andrea
 *
 */
public class FilesystemCollectorPlugin extends AbstractCollectorPlugin {

    /**
     * Returns an iterable over the files found under the base URL of the given interface.
     *
     * @param interfaceDescriptor
     *            descriptor providing the base URL and the collection parameters
     * @param fromDate
     *            not used by this plugin
     * @param untilDate
     *            not used by this plugin
     * @return the iterable over the collected records
     * @throws CollectorServiceException
     *             when the base URL is null or empty, or when the iterable cannot be created
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        final String baseUrl = interfaceDescriptor.getBaseUrl();
        final boolean missingBaseUrl = baseUrl == null || baseUrl.isEmpty();
        if (missingBaseUrl) {
            throw new CollectorServiceException("Param 'baseurl' is null or empty");
        }
        return new FilesystemIterable(interfaceDescriptor);
    }
}

View File

@ -0,0 +1,139 @@
package eu.dnetlib.data.collector.plugins.filesystem;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.ximpleware.*;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONObject;
import org.json.XML;
/**
 * Iterable over the records stored as files (xml or json) under a base directory.
 * <p>
 * Json files are wrapped in a {@code record/metadata} structure and converted to XML; xml files are cleaned from entities. When
 * {@code setObjIdentifierFromFileName} is enabled, the base name of the file is added to the record as {@code objIdentifier}.
 * </p>
 *
 * @author Sandro, Michele, Andrea
 */
public class FilesystemIterable implements Iterable<String> {

    /**
     * The Constant log.
     */
    private static final Log log = LogFactory.getLog(FilesystemIterable.class);

    /**
     * The base dir.
     */
    private File baseDir;

    /**
     * The extensions.
     */
    private String extensions;

    /**
     * File format (json / xml)
     **/
    private String fileFormat = "xml";

    private List<String> supportedFormats = Lists.newArrayList("xml", "json");

    private boolean setObjIdentifierFromFileName = false;

    /**
     * Instantiates a new filesystem iterable.
     *
     * @param descriptor
     *            the descriptor; its base URL identifies the directory, its parameters {@code extensions}, {@code fileFormat} and
     *            {@code setObjIdentifierFromFileName} tune the collection
     * @throws CollectorServiceException
     *             when the base URL is malformed, the directory does not exist or the file format is not supported
     */
    public FilesystemIterable(final InterfaceDescriptor descriptor) throws CollectorServiceException {
        try {
            final String baseUrl = descriptor.getBaseUrl();
            URL basePath = new URL(baseUrl);
            this.baseDir = new File(basePath.getPath());
            if (!baseDir.exists()) { throw new CollectorServiceException(String.format("The base ULR %s, does not exist", basePath.getPath())); }
            this.extensions = descriptor.getParams().get("extensions");
            if (descriptor.getParams().containsKey("fileFormat")) {
                fileFormat = descriptor.getParams().get("fileFormat");
            }
            if (!supportedFormats.contains(fileFormat)) {
                throw new CollectorServiceException("File format " + fileFormat + " not supported. Supported formats are: " + StringUtils
                        .join(supportedFormats, ','));
            }
            if (descriptor.getParams().containsKey("setObjIdentifierFromFileName")) {
                setObjIdentifierFromFileName = Boolean.parseBoolean(descriptor.getParams().get("setObjIdentifierFromFileName"));
            }
        } catch (MalformedURLException e) {
            throw new CollectorServiceException("Filesystem collector failed! ", e);
        }
    }

    /**
     * {@inheritDoc}
     *
     * @see java.lang.Iterable#iterator()
     */
    @Override
    public Iterator<String> iterator() {
        final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), extensions);
        return Iterators.transform(fsi, inputFileName -> readRecord(inputFileName));
    }

    /**
     * Reads a file and turns its content into a record.
     *
     * @param inputFileName
     *            path of the file to read
     * @return the record, or the empty string when the file cannot be read or processed (the error is logged)
     */
    private String readRecord(final String inputFileName) {
        try (FileInputStream fileInputStream = new FileInputStream(inputFileName)) {
            // Decode explicitly as UTF-8: the BOM check below and the VTD processing both assume it,
            // the platform default charset must not influence the result.
            final String s = IOUtils.toString(fileInputStream, "UTF-8");
            if (fileFormat.equalsIgnoreCase("json")) {
                JSONObject json = new JSONObject(s);
                JSONObject obj = new JSONObject();
                if (setObjIdentifierFromFileName) {
                    obj.put("header", new JSONObject().put("objIdentifier", FilenameUtils.getBaseName(inputFileName)));
                }
                obj.put("metadata", json);
                log.debug(obj.toString());
                return XML.toString(obj, "record");
            }
            // drop the byte order mark, if any, before cleaning
            String cleanedXML = XmlCleaner.cleanAllEntities(s.startsWith("\uFEFF") ? s.substring(1) : s);
            if (setObjIdentifierFromFileName) {
                return addObjIdentifier(cleanedXML, FilenameUtils.getBaseName(inputFileName));
            }
            return cleanedXML;
        } catch (VTDException e) {
            log.error("Cannot process with VTD to set the objIdentifier " + inputFileName, e);
            return "";
        } catch (Exception e) {
            log.error("Unable to read " + inputFileName, e);
            return "";
        }
    }

    /**
     * Wraps the root element of the given XML in {@code record/metadata} and adds a header carrying the object identifier.
     *
     * @param xml
     *            the XML to wrap
     * @param objidentifier
     *            the identifier to add; it is XML-escaped before being inserted
     * @return the wrapped XML
     * @throws VTDException
     *             when the XML cannot be parsed or modified
     * @throws IOException
     *             when the result cannot be serialized
     */
    private String addObjIdentifier(String xml, String objidentifier) throws VTDException, IOException {
        VTDGen vg = new VTDGen(); // Instantiate VTDGen
        XMLModifier xm = new XMLModifier(); //Instantiate XMLModifier
        vg.setDoc(xml.getBytes("UTF-8"));
        vg.parse(false);
        VTDNav vn = vg.getNav();
        xm.bind(vn);
        if (vn.toElement(VTDNav.ROOT)) {
            xm.insertBeforeElement("<record><header><objIdentifier>" + escapeXmlText(objidentifier) + "</objIdentifier></header><metadata>");
            xm.insertAfterElement("</metadata></record>");
        }
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        xm.output(baos);
        return baos.toString("UTF-8");
    }

    /** Escapes the characters that are not allowed in XML element content, so that any file name yields well-formed XML. */
    private static String escapeXmlText(final String text) {
        return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
    }
}

View File

@ -0,0 +1,66 @@
package eu.dnetlib.data.collector.plugins.ftp;
import java.util.Iterator;
import java.util.Set;
import com.google.common.base.Splitter;
import com.google.common.collect.Sets;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.springframework.beans.factory.annotation.Required;
/**
 * Collector plugin that validates the FTP interface parameters and delegates the actual
 * iteration to the configured {@link FtpIteratorFactory}.
 *
 * @author Author: Andrea Mannocci
 *
 */
public class FtpCollectorPlugin extends AbstractCollectorPlugin {

    private FtpIteratorFactory ftpIteratorFactory;

    /**
     * Checks the mandatory parameters (base url, username, password, recursive, extensions) and the
     * format of the optional from date, then returns an iterable backed by the iterator factory.
     *
     * @param interfaceDescriptor descriptor holding the base url and the parameters
     * @param fromDate            optional lower bound, expected as YYYY-MM-DD
     * @param untilDate           not used by this plugin
     * @return an iterable creating a new FTP iterator at each call of {@code iterator()}
     * @throws CollectorServiceException if a mandatory parameter is missing or the date is malformed
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {

        final String baseUrl = interfaceDescriptor.getBaseUrl();
        final String username = interfaceDescriptor.getParams().get("username");
        final String password = interfaceDescriptor.getParams().get("password");
        final String recursive = interfaceDescriptor.getParams().get("recursive");
        final String extensions = interfaceDescriptor.getParams().get("extensions");

        requireParam(baseUrl, "baseurl");
        requireParam(username, "username");
        requireParam(password, "password");
        requireParam(recursive, "recursive");
        requireParam(extensions, "extensions");

        final boolean dateIsMalformed = (fromDate != null) && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}");
        if (dateIsMalformed) {
            throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + fromDate);
        }

        // both values are computed once, when the iterable is created
        final boolean isRecursive = "true".equals(recursive);
        final Set<String> extensionsSet = Sets.newHashSet(Splitter.on(",").omitEmptyStrings().trimResults().split(extensions));

        return () -> getFtpIteratorFactory().newIterator(baseUrl, username, password, isRecursive, extensionsSet, fromDate);
    }

    /**
     * Fails when a mandatory parameter is null or empty.
     *
     * @param value the parameter value
     * @param name  the parameter name reported in the error message
     * @throws CollectorServiceException if the value is null or empty
     */
    private static void requireParam(final String value, final String name) throws CollectorServiceException {
        if ((value == null) || value.isEmpty()) {
            throw new CollectorServiceException("Param '" + name + "' is null or empty");
        }
    }

    public FtpIteratorFactory getFtpIteratorFactory() {
        return this.ftpIteratorFactory;
    }

    @Required
    public void setFtpIteratorFactory(final FtpIteratorFactory ftpIteratorFactory) {
        this.ftpIteratorFactory = ftpIteratorFactory;
    }
}

View File

@ -0,0 +1,208 @@
package eu.dnetlib.data.collector.plugins.ftp;
import java.io.IOException;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPFile;
import org.apache.commons.net.ftp.FTPReply;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
/**
 * Iterator over the content of the files found on an FTP server. The list of remote paths is built
 * when the iterator is created; each call of {@link #next()} downloads one file and returns its content.
 *
 * @author Author: Andrea Mannocci
 *
 */
public class FtpIterator implements Iterator<String> {

    private static final Log log = LogFactory.getLog(FtpIterator.class);

    private static final int MAX_RETRIES = 5;
    private static final int DEFAULT_TIMEOUT = 30000;
    private static final long BACKOFF_MILLIS = 10000;

    private FTPClient ftpClient;
    private String ftpServerAddress;
    private String remoteFtpBasePath;
    private String username;
    private String password;
    private boolean isRecursive;
    private Set<String> extensionsSet;
    private boolean incremental;
    private DateTime fromDate = null;
    private DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");

    private Queue<String> queue;

    /**
     * Connects to the server named in the base url and lists the files to collect.
     *
     * @param baseUrl       url whose host is the FTP server and whose path is the remote base folder
     * @param username      FTP user
     * @param password      FTP password
     * @param isRecursive   whether sub-folders must be visited
     * @param extensionsSet file name suffixes to accept
     * @param fromDate      when not blank (format yyyy-MM-dd), only files modified after it are collected
     * @throws CollectorServiceRuntimeException if the url is malformed, or the connection or the listing fails
     */
    public FtpIterator(final String baseUrl, final String username, final String password, final boolean isRecursive,
            final Set<String> extensionsSet, String fromDate) {
        this.username = username;
        this.password = password;
        this.isRecursive = isRecursive;
        this.extensionsSet = extensionsSet;
        this.incremental = StringUtils.isNotBlank(fromDate);
        if (incremental) {
            //I expect fromDate in the format 'yyyy-MM-dd'. See class eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode .
            this.fromDate = DateTime.parse(fromDate, simpleDateTimeFormatter);
            log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString());
        }
        try {
            URL server = new URL(baseUrl);
            this.ftpServerAddress = server.getHost();
            this.remoteFtpBasePath = server.getPath();
        } catch (MalformedURLException e1) {
            // keep the cause so the reason of the failure is not lost
            throw new CollectorServiceRuntimeException("Malformed URL exception " + baseUrl, e1);
        }

        connectToFtpServer();
        initializeQueue();
    }

    /**
     * Creates a new client, connects, logs in and switches to local passive mode.
     */
    private void connectToFtpServer() {
        ftpClient = new FTPClient();
        ftpClient.setDefaultTimeout(DEFAULT_TIMEOUT);
        ftpClient.setDataTimeout(DEFAULT_TIMEOUT);
        ftpClient.setConnectTimeout(DEFAULT_TIMEOUT);
        try {
            ftpClient.connect(ftpServerAddress);

            // try to login
            if (!ftpClient.login(username, password)) {
                // release the connection too, not only the session
                disconnectFromFtpServer();
                throw new CollectorServiceRuntimeException("Unable to login to FTP server " + ftpServerAddress);
            }
            int reply = ftpClient.getReplyCode();
            if (!FTPReply.isPositiveCompletion(reply)) {
                ftpClient.disconnect();
                throw new CollectorServiceRuntimeException("Unable to connect to FTP server " + ftpServerAddress);
            }

            ftpClient.enterLocalPassiveMode();
            log.info("Connected to FTP server " + ftpServerAddress);
            log.info(String.format("FTP collecting from %s with recursion = %s", remoteFtpBasePath, isRecursive));
        } catch (IOException e) {
            throw new CollectorServiceRuntimeException("Unable to connect to FTP server " + ftpServerAddress, e);
        }
    }

    /**
     * Logs out and disconnects. The disconnection is attempted even when the logout fails: otherwise,
     * after a broken transfer, the client would still report itself as connected and the retry loop
     * in {@link #next()} would never open a fresh connection.
     */
    private void disconnectFromFtpServer() {
        if (!ftpClient.isConnected()) {
            return;
        }
        try {
            ftpClient.logout();
        } catch (IOException e) {
            log.error("Failed to logout & disconnect from the FTP server", e);
        } finally {
            try {
                ftpClient.disconnect();
            } catch (IOException e) {
                log.error("Failed to logout & disconnect from the FTP server", e);
            }
        }
    }

    private void initializeQueue() {
        queue = new LinkedList<String>();
        listDirectoryRecursive(remoteFtpBasePath, "");
    }

    /**
     * Adds to the queue the accepted files of a remote folder, visiting sub-folders in recursive mode.
     *
     * @param parentDir  path of the parent folder
     * @param currentDir name of the folder to list inside the parent, or the empty string for the parent itself
     */
    private void listDirectoryRecursive(final String parentDir, final String currentDir) {
        String dirToList = parentDir;
        if (!currentDir.equals("")) {
            dirToList += "/" + currentDir;
        }
        final FTPFile[] subFiles;
        try {
            subFiles = ftpClient.listFiles(dirToList);
        } catch (IOException e) {
            throw new CollectorServiceRuntimeException("Unable to list FTP remote folder", e);
        }
        if (subFiles == null) {
            return;
        }
        for (FTPFile aFile : subFiles) {
            if (aFile == null) {
                // entries the client could not parse are reported as null
                continue;
            }
            String currentFileName = aFile.getName();
            if (currentFileName.equals(".") || currentFileName.equals("..")) {
                // skip parent directory and directory itself
                continue;
            }
            if (aFile.isDirectory()) {
                if (isRecursive) {
                    listDirectoryRecursive(dirToList, currentFileName);
                }
            } else if (hasAcceptedExtension(currentFileName) && mustBeCollected(aFile)) {
                // queued exactly once, even when more than one configured extension matches
                queue.add(dirToList + "/" + currentFileName);
            }
        }
    }

    /**
     * Tells whether the file name ends with one of the configured extensions.
     */
    private boolean hasAcceptedExtension(final String fileName) {
        for (String ext : extensionsSet) {
            if (fileName.endsWith(ext)) {
                return true;
            }
        }
        return false;
    }

    /**
     * In incremental mode accepts only the files modified after the from date; otherwise accepts every file.
     */
    private boolean mustBeCollected(final FTPFile aFile) {
        if (!incremental) {
            //not incremental: just add it to the queue
            return true;
        }
        //incremental mode: let's check the last update date
        Calendar timestamp = aFile.getTimestamp();
        DateTime lastModificationDate = new DateTime(timestamp);
        if (lastModificationDate.isAfter(fromDate)) {
            log.debug(aFile.getName() + " has changed and must be re-collected");
            return true;
        }
        if (log.isDebugEnabled()) {
            log.debug(aFile.getName() + " has not changed since last collection");
        }
        return false;
    }

    /**
     * Tells whether other files must be downloaded; when none is left the connection is closed.
     */
    @Override
    public boolean hasNext() {
        if (queue.isEmpty()) {
            disconnectFromFtpServer();
            return false;
        } else {
            return true;
        }
    }

    /**
     * Downloads the next file, retrying up to {@code MAX_RETRIES} times with a pause between attempts.
     *
     * @return the content of the file
     * @throws NoSuchElementException           if no file is left
     * @throws CollectorServiceRuntimeException if the file cannot be retrieved, or the thread is interrupted while waiting
     */
    @Override
    public String next() {
        String nextRemotePath = queue.remove();
        int nRepeat = 0;
        while (nRepeat < MAX_RETRIES) {
            try {
                OutputStream baos = new ByteArrayOutputStream();
                if (!ftpClient.isConnected()) {
                    connectToFtpServer();
                }
                ftpClient.retrieveFile(nextRemotePath, baos);

                log.debug(String.format("Collected file from FTP: %s%s", ftpServerAddress, nextRemotePath));
                return baos.toString();
            } catch (IOException e) {
                nRepeat++;
                log.warn(String.format("An error occurred [%s] for %s%s, retrying.. [retried %s time(s)]", e.getMessage(), ftpServerAddress, nextRemotePath,
                        nRepeat));
                disconnectFromFtpServer();
                try {
                    Thread.sleep(BACKOFF_MILLIS);
                } catch (InterruptedException e1) {
                    // restore the flag and stop: retrying would ignore the interruption request
                    Thread.currentThread().interrupt();
                    throw new CollectorServiceRuntimeException(
                            String.format("Interrupted while waiting to retry FTP file %s", nextRemotePath), e1);
                }
            }
        }
        throw new CollectorServiceRuntimeException(String.format("Impossible to retrieve FTP file %s after %s retries. Aborting FTP collection.", nextRemotePath, nRepeat));
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}

View File

@ -0,0 +1,20 @@
package eu.dnetlib.data.collector.plugins.ftp;
import java.util.Iterator;
import java.util.Set;
/**
 * Factory of {@link FtpIterator} instances.
 *
 * @author Author: Andrea Mannocci
 *
 */
public class FtpIteratorFactory {

    /**
     * Builds a new {@link FtpIterator}, forwarding every argument to its constructor.
     *
     * @param baseUrl       url of the remote folder
     * @param username      FTP user
     * @param password      FTP password
     * @param isRecursive   whether sub-folders must be visited
     * @param extensionsSet accepted file extensions
     * @param fromDate      optional from date
     * @return the new iterator
     */
    public Iterator<String> newIterator(final String baseUrl, final String username, final String password,
            final boolean isRecursive, final Set<String> extensionsSet, final String fromDate) {
        final Iterator<String> ftpIterator = new FtpIterator(baseUrl, username, password, isRecursive, extensionsSet, fromDate);
        return ftpIterator;
    }
}

View File

@ -0,0 +1,37 @@
package eu.dnetlib.data.collector.plugins.httpfilename;
import eu.dnetlib.data.collector.plugins.HttpConnector;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
/**
 * {@link HttpConnector} that remembers the body of the last successful request.
 *
 * Created by miriam on 07/05/2018.
 */
public class Connector extends HttpConnector implements ConnectorInterface {

    /** Body returned by the last call of {@link #get(String)}, or null when that call failed. */
    private String response;

    /**
     * Requests the given url and stores the result.
     *
     * @param requestUrl the url to request
     * @throws CollectorServiceException if the request fails; in that case no response is retained
     */
    @Override
    public void get(final String requestUrl) throws CollectorServiceException {
        // forget the previous result first: if the request throws, isStatusOk() must not
        // report success on the content of an earlier url
        response = null;
        response = getInputSource(requestUrl);
    }

    @Override
    public String getResponse() {
        return response;
    }

    /**
     * @return true when the last request produced a response
     */
    @Override
    public boolean isStatusOk() {
        return (response != null);
    }

    /**
     * @param string the text to look for
     * @return true when the response type is known and contains the given text
     */
    @Override
    public boolean responseTypeContains(String string) {
        String responseType = getResponseType();
        return (responseType != null) && responseType.contains(string);
    }
}

View File

@ -0,0 +1,19 @@
package eu.dnetlib.data.collector.plugins.httpfilename;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
/**
 * Minimal contract of the connector used by the HTTP-with-file-name collector.
 *
 * Created by miriam on 07/05/2018.
 */
public interface ConnectorInterface {

    /**
     * Requests the given url.
     *
     * @param requestUrl the url to request
     * @throws CollectorServiceException if the request fails
     */
    void get(String requestUrl) throws CollectorServiceException;

    /**
     * @return the response of the last request
     */
    String getResponse();

    /**
     * @return whether the last request is considered successful
     */
    boolean isStatusOk();

    /**
     * @param string the text to look for in the response type
     * @return whether the response type contains the given text
     */
    boolean responseTypeContains(String string);
}

View File

@ -0,0 +1,190 @@
package eu.dnetlib.data.collector.plugins.httpfilename;
import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONObject;
import org.json.XML;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Iterable that crawls an HTTP folder listing in a background thread and exposes the collected
 * metadata files through a bounded queue.
 *
 * Created by miriam on 04/05/2018.
 */
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {

    private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);

    private static final String JUNK = "<resource><url>%s</url><DOI>JUNK</DOI></resource>";
    public static final String APP_JSON = "application/json";
    public static final String APP_XML = "application/xml";
    public static final String TEXT_HTML = "text/html";

    /** Hand-over buffer between the crawling thread and the iterator. */
    private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);

    /** Semicolon separated list of texts: a metadata file containing one of them is discarded. */
    private String filterParam;

    int total = 0;
    int filtered = 0;

    /**
     * Starts the background thread that fills the queue.
     *
     * @param startUrl url where the crawling starts
     * @param filter   semicolon separated list of texts used to discard metadata files; may be null or empty
     */
    public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
        this.filterParam = filter;
        Thread ft = new Thread(new FillMetaQueue(startUrl) );
        ft.start();
    }

    @Override
    public Iterator<String> iterator() {
        return new HttpWithFileNameCollectorIterator(queue);
    }

    /**
     * Task visiting the urls: html pages are scanned for links, metadata files are pushed to the queue.
     */
    private class FillMetaQueue implements Runnable {

        final Connector c = new Connector();

        /** Urls of the metadata files (.json / .xml) still to download. */
        private final List<String> metas = Collections.synchronizedList(new ArrayList<String>());

        /** Urls of the other links still to visit. */
        private final List<String> urls = Collections.synchronizedList(new ArrayList<>());

        public FillMetaQueue(String startUrl){
            if(!startUrl.isEmpty()){
                urls.add(startUrl);
            }
        }

        /**
         * Consumes the pending urls (metadata files first) and finally pushes the terminator.
         */
        public void fillQueue() {
            String url;
            while((metas.size()>0 || urls.size() > 0 )) {
                log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
                if (metas.size() > 0) {
                    url = metas.remove(0);
                    if (!fetch(url)) {
                        continue;
                    }
                    try {
                        String ret = c.getResponse();
                        if (ret != null && ret.length()>0) {
                            if (!containsFilter(ret))
                                queue.put(addFilePath(ret, url, url.endsWith(".json")));
                            else
                                filtered++;
                            total++;
                        }
                    } catch (InterruptedException e) {
                        log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
                    }
                } else {
                    url = urls.remove(0);
                    if (!fetch(url)) {
                        continue;
                    }
                    if (c.responseTypeContains(TEXT_HTML)){
                        recurFolder(c.getResponse(), url);
                    } else if(c.responseTypeContains(APP_JSON) || c.responseTypeContains(APP_XML)){
                        try {
                            final String element = addFilePath(c.getResponse(), url, c.responseTypeContains(APP_JSON));
                            queue.put(element);
                        } catch (InterruptedException e) {
                            log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
                        }
                    }
                }
            }
            try {
                queue.put(HttpWithFileNameCollectorIterator.TERMINATOR);
            } catch (InterruptedException e) {
                throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS), e);
            }
        }

        /**
         * Requests the url through the connector.
         *
         * @param url the url to request
         * @return true only when this very request succeeded; a failed request must not be processed,
         *         because the connector could still hold the response of a previous url
         */
        private boolean fetch(final String url) {
            try {
                c.get(url);
            } catch (CollectorServiceException e) {
                log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
                return false;
            }
            return c.isStatusOk();
        }

        /**
         * @param meta the content of a metadata file
         * @return true when the content contains one of the configured filter texts
         */
        private boolean containsFilter(String meta){
            if (filterParam == null || filterParam.isEmpty())
                return false;
            String[] filter = filterParam.split(";");
            for(String item:filter){
                if (meta.contains(item))
                    return true;
            }
            return false;
        }

        /**
         * Adds to the metadata a downloadFileUrl entry obtained from the url (".../metadata/..." becomes
         * ".../pdf/..." and the extension becomes ".pdf"); json metadata is then converted to XML.
         *
         * @param meta   the content of the metadata file
         * @param url    the url of the metadata file
         * @param isJson whether the content is json
         * @return the enriched record, or a junk record when the json cannot be converted
         */
        private String addFilePath(String meta, String url, boolean isJson){
            String path = url.replace("metadata", "pdf");

            try {
                if(isJson)
                    meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
                else {
                    if (meta.contains("<!DOCTYPE")) {
                        meta = meta.substring(meta.indexOf("<!DOCTYPE"));
                        meta = meta.substring(meta.indexOf(">") + 1);
                    }
                    int index = meta.lastIndexOf("</");
                    meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
                }
            } catch(Exception ex) {
                log.info("not file with extension .json or .xml");
            }

            if(isJson) {
                try {
                    return XML.toString(new JSONObject("{'resource':" + meta + "}"));
                } catch(Exception e) {
                    log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url);
                    final String junk = String.format(JUNK, url);
                    log.warn("returning " + junk);
                    return junk;
                }
            }
            return meta;
        }

        /**
         * Scans an html page and schedules its links: .json and .xml links as metadata files, the others as urls to visit.
         *
         * @param text the html page
         * @param url  the url of the page, used as prefix of the links
         */
        private void recurFolder(String text, String url){
            Document doc = Jsoup.parse(text);
            Elements links = doc.select("a");
            for(Element e:links){
                if (!e.text().equals("../")){
                    String file = e.attr("href");
                    if(file.endsWith(".json") || file.endsWith(".xml"))
                        metas.add(url+file);
                    else
                        urls.add(url+file);
                }
            }
        }

        @Override
        public void run() {
            fillQueue();
        }
    }
}

View File

@ -0,0 +1,16 @@
package eu.dnetlib.data.collector.plugins.httpfilename;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
 * Collector plugin returning a {@link HTTPWithFileNameCollectorIterable} for the interface base url.
 *
 * Created by miriam on 04/05/2018.
 */
public class HTTPWithFileNameCollectorPlugin extends AbstractCollectorPlugin {

    /**
     * Creates the iterable from the base url and the "filter" parameter of the interface.
     *
     * @param interfaceDescriptor descriptor holding the base url and the parameters
     * @param s                   not used by this plugin
     * @param s1                  not used by this plugin
     * @return the iterable over the collected records
     * @throws CollectorServiceException declared by the plugin contract
     */
    @Override
    public Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String s, String s1) throws CollectorServiceException {
        final String startUrl = interfaceDescriptor.getBaseUrl();
        final String filter = interfaceDescriptor.getParams().get("filter");
        return new HTTPWithFileNameCollectorIterable(startUrl, filter);
    }
}

View File

@ -0,0 +1,63 @@
package eu.dnetlib.data.collector.plugins.httpfilename;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
/**
 * Iterator consuming the elements of a blocking queue until the {@link #TERMINATOR} element is found.
 * The next element is always fetched in advance, so both the constructor and {@link #next()} may
 * block waiting for the producer.
 *
 * Created by miriam on 25/06/2018.
 */
public class HttpWithFileNameCollectorIterator implements Iterator<String> {

    /** Element marking the end of the queue content. */
    public static final String TERMINATOR = "FINITO";

    private static final Log log = LogFactory.getLog(HttpWithFileNameCollectorIterator.class);

    private final ArrayBlockingQueue<String> queue;

    public static final long waitTime = 60L;

    /** Element fetched in advance, returned by the next call of {@link #next()}. */
    private String last = "<resource><DOI>JUNK</DOI></resource>";

    /**
     * @param queue the queue to consume; the first element is taken immediately
     * @throws NoSuchElementException if the thread is interrupted while waiting for the first element
     */
    public HttpWithFileNameCollectorIterator(ArrayBlockingQueue<String> queue) {
        this.queue = queue;
        extractFromQueue();
    }

    @Override
    public boolean hasNext() {
        return !(Objects.equals(last, TERMINATOR));
    }

    /**
     * @return the element fetched in advance
     * @throws NoSuchElementException if the terminator has already been received, or the thread is
     *                                interrupted while waiting for the following element
     */
    @Override
    public String next() {
        if (!hasNext()) {
            // nothing follows the terminator: taking from the queue again would block forever
            throw new NoSuchElementException("No more elements: the terminator has already been received");
        }
        final String current = last;
        extractFromQueue();
        return current;
    }

    /**
     * Takes the next element from the queue, waiting for it if necessary.
     */
    private void extractFromQueue() {
        try {
            last = queue.take();
        } catch (InterruptedException e) {
            log.warn("Interrupted while waiting for element to consume");
            // keep the interruption visible to the code running on this thread
            Thread.currentThread().interrupt();
            throw new NoSuchElementException(e.getMessage());
        }
    }
}

View File

@ -0,0 +1,21 @@
package eu.dnetlib.data.collector.plugins.httplist;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.plugins.HttpConnector;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.springframework.beans.factory.annotation.Autowired;
/**
 * Collector plugin returning an iterable of {@link HttpListIterator}, built from the interface
 * base url and the "listUrl" parameter.
 */
public class HttpListCollectorPlugin extends AbstractCollectorPlugin {

    @Autowired
    private HttpConnector httpConnector;

    /**
     * @param interfaceDescriptor descriptor holding the base url and the "listUrl" parameter
     * @param fromDate            not used by this plugin
     * @param untilDate           not used by this plugin
     * @return an iterable creating a new {@link HttpListIterator} at each call of {@code iterator()}
     * @throws CollectorServiceException declared by the plugin contract
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        // both values are read now, not when the iterator is requested
        final String urlPrefix = interfaceDescriptor.getBaseUrl();
        final String urlOfList = interfaceDescriptor.getParams().get("listUrl");

        return () -> {
            return new HttpListIterator(urlPrefix, urlOfList, httpConnector);
        };
    }
}

View File

@ -0,0 +1,64 @@
package eu.dnetlib.data.collector.plugins.httplist;
import eu.dnetlib.data.collector.plugins.HttpConnector;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import org.apache.commons.lang3.StringUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.Iterator;
/**
 * Iterator over the documents named in a remote list: the list is downloaded once, then each line is
 * appended to the base url and downloaded when requested. The iteration stops at the first blank
 * line or at the end of the list.
 */
public class HttpListIterator implements Iterator<String> {

    private HttpConnector httpConnector;

    private String baseUrl;

    /** Line of the list that the next call of {@link #next()} will download. */
    private String currentLine;

    private BufferedReader reader;

    /**
     * Downloads the list and positions the iterator on its first line.
     *
     * @param baseUrl       prefix of the url of every document
     * @param listAddress   url of the list of documents
     * @param httpConnector connector used for all the downloads
     * @throws RuntimeException if the list cannot be downloaded or read
     */
    public HttpListIterator(final String baseUrl, final String listAddress, final HttpConnector httpConnector) {
        try {
            this.baseUrl = baseUrl;
            // the connector must be set before the first download: download() reads this field
            this.httpConnector = httpConnector;
            this.reader = new BufferedReader(new StringReader(download(listAddress)));
            this.currentLine = reader.readLine();
        } catch (Exception e) {
            throw new RuntimeException("Error creating iterator", e);
        }
    }

    @Override
    public synchronized boolean hasNext() {
        return StringUtils.isNotBlank(currentLine);
    }

    /**
     * Downloads the document of the current line and moves to the following line, even when the download fails.
     *
     * @return the content of the document
     * @throws java.util.NoSuchElementException if the list is exhausted
     * @throws RuntimeException                 if the download fails or the list cannot be read
     */
    @Override
    public synchronized String next() {
        if (!hasNext()) {
            // NoSuchElementException is a RuntimeException: callers catching the old type still work
            throw new java.util.NoSuchElementException("Iterator has reached the end");
        }
        final String line = currentLine;
        try {
            return download(baseUrl + line);
        } finally {
            try {
                this.currentLine = reader.readLine();
            } catch (IOException e) {
                throw new RuntimeException("Error obtaining next element " + line, e);
            }
        }
    }

    /**
     * @param url the url to download
     * @return the content found at the url
     * @throws RuntimeException wrapping the collector exception if the download fails
     */
    private String download(final String url) {
        try {
            return httpConnector.getInputSource(url);
        } catch (CollectorServiceException e) {
            throw new RuntimeException(e);
        }
    }

    /** Does nothing: elements cannot be removed from the remote list. */
    @Override
    public void remove() {}
}

View File

@ -0,0 +1,43 @@
package eu.dnetlib.data.collector.plugins.mongo;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.util.Iterator;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.information.collectionservice.rmi.CollectionServiceException;
/**
 * Iterable over the content of a mongo dump file.
 */
public class MongoDumpIterable implements Iterable<String> {

    /** Reader opened on the dump file; the same instance is handed to every iterator. */
    private final FileReader dumpReader;

    /**
     * Opens the dump file.
     *
     * @param inputFile the dump file
     * @throws CollectorServiceException if the file cannot be opened
     */
    public MongoDumpIterable(final File inputFile) throws CollectorServiceException {
        final FileReader opened;
        try {
            opened = new FileReader(inputFile);
        } catch (FileNotFoundException e) {
            throw new CollectorServiceException("Error unable to open inputStream", e);
        }
        this.dumpReader = opened;
    }

    /**
     * @return a new {@link MongoDumpIterator} reading from the reader opened by the constructor
     * @see java.lang.Iterable#iterator()
     */
    @Override
    public Iterator<String> iterator() {
        final Iterator<String> dumpIterator = new MongoDumpIterator(dumpReader);
        return dumpIterator;
    }
}

View File

@ -0,0 +1,56 @@
package eu.dnetlib.data.collector.plugins.mongo;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.Iterator;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
/**
 * Iterator over a mongo dump read line by line: each line is parsed as json and the value of its
 * "body" field is returned. Lines without a "body" field are skipped.
 */
public class MongoDumpIterator implements Iterator<String> {

    private final BufferedReader inputStream;

    /** Parser shared by all the lines, instead of a new instance per line. */
    private final JsonParser parser = new JsonParser();

    /** Body fetched in advance, or null when the dump is exhausted. */
    private String currentLine = null;

    /**
     * @param inputStream reader positioned at the beginning of the dump; the first body is read immediately
     */
    public MongoDumpIterator(final FileReader inputStream) {
        this.inputStream = new BufferedReader(inputStream);
        this.currentLine = getNextLine();
    }

    @Override
    public boolean hasNext() {
        return currentLine != null;
    }

    /**
     * @return the body fetched in advance
     * @throws java.util.NoSuchElementException if the dump is exhausted
     */
    @Override
    public String next() {
        if (currentLine == null) {
            throw new java.util.NoSuchElementException("The dump has no more records");
        }
        final String returnedString = this.currentLine;
        this.currentLine = getNextLine();
        return returnedString;
    }

    /** Does nothing: records cannot be removed from the dump. */
    @Override
    public void remove() {}

    /**
     * Reads lines until one carrying a "body" field is found.
     *
     * @return the body, or null when the dump is finished or cannot be read; in both cases the reader is closed
     */
    private String getNextLine() {
        try {
            String input = inputStream.readLine();
            while (input != null) {
                final JsonElement jElement = parser.parse(input);
                // blank lines and non-object values carry no body: skip them instead of failing
                if (jElement.isJsonObject()) {
                    final JsonObject jobject = jElement.getAsJsonObject();
                    if (jobject.has("body")) { return jobject.get("body").getAsString(); }
                }
                input = inputStream.readLine();
            }
        } catch (IOException e) {
            // NOTE(review): a read error ends the iteration as if the dump were finished, as before;
            // confirm that a truncated collection is acceptable here
        }
        closeReader();
        return null;
    }

    /**
     * Releases the file once the iteration is over.
     */
    private void closeReader() {
        try {
            inputStream.close();
        } catch (IOException ignored) {
            // nothing can be done: the iteration is finished anyway
        }
    }
}

View File

@ -0,0 +1,23 @@
package eu.dnetlib.data.collector.plugins.mongo;
import java.io.File;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
 * Collector plugin reading the records from a mongo dump file whose path is the interface base url.
 */
public class MongoDumpPlugin extends AbstractCollectorPlugin {

    /**
     * @param interfaceDescriptor descriptor whose base url is the path of the dump file
     * @param fromDate            not used by this plugin
     * @param untilDate           not used by this plugin
     * @return the iterable over the dump
     * @throws CollectorServiceException if the base url is missing, or the file does not exist or cannot be opened
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        final String baseUrl = interfaceDescriptor.getBaseUrl();
        final boolean missingUrl = (baseUrl == null) || baseUrl.isEmpty();
        if (missingUrl) {
            throw new CollectorServiceException("Param 'baseurl' is null or empty");
        }

        final File dumpFile = new File(baseUrl);
        if (!dumpFile.exists()) {
            throw new CollectorServiceException("the file at url " + baseUrl + " does not exists");
        }

        return new MongoDumpIterable(dumpFile);
    }
}

View File

@ -0,0 +1,58 @@
package eu.dnetlib.data.collector.plugins.oai;
import java.util.List;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.springframework.beans.factory.annotation.Required;
/**
 * Collector plugin for OAI: validates the interface parameters and concatenates one iterator
 * per requested set, created by the configured {@link OaiIteratorFactory}.
 */
public class OaiCollectorPlugin extends AbstractCollectorPlugin {

    private static final String FORMAT_PARAM = "format";
    private static final String OAI_SET_PARAM = "set";

    private OaiIteratorFactory oaiIteratorFactory;

    /**
     * @param interfaceDescriptor descriptor holding the base url and the "format" and "set" parameters
     * @param fromDate            optional lower bound, expected as YYYY-MM-DD
     * @param untilDate           optional upper bound, expected as YYYY-MM-DD
     * @return an iterable over the records of all the requested sets
     * @throws CollectorServiceException if the base url or the format is missing, or a date is malformed
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        final String baseUrl = interfaceDescriptor.getBaseUrl();
        final String mdFormat = interfaceDescriptor.getParams().get(FORMAT_PARAM);
        final String setParam = interfaceDescriptor.getParams().get(OAI_SET_PARAM);

        if (baseUrl == null || baseUrl.isEmpty()) {
            throw new CollectorServiceException("Param 'baseurl' is null or empty");
        }
        if (mdFormat == null || mdFormat.isEmpty()) {
            throw new CollectorServiceException("Param 'mdFormat' is null or empty");
        }
        if (isMalformedDate(fromDate)) {
            throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + fromDate);
        }
        if (isMalformedDate(untilDate)) {
            throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + untilDate);
        }

        final List<String> sets = Lists.newArrayList();
        if (setParam != null) {
            for (final String oaiSet : Splitter.on(",").omitEmptyStrings().trimResults().split(setParam)) {
                sets.add(oaiSet);
            }
        }
        if (sets.isEmpty()) {
            // If no set is defined, ALL the sets must be harvested
            sets.add("");
        }

        return () -> Iterators.concat(
                sets.stream()
                        .map(set -> oaiIteratorFactory.newIterator(baseUrl, mdFormat, set, fromDate, untilDate))
                        .iterator());
    }

    /**
     * @param date the date to check, possibly null
     * @return true when the date is present and does not look like YYYY-MM-DD
     */
    private static boolean isMalformedDate(final String date) {
        return (date != null) && !date.matches("\\d{4}-\\d{2}-\\d{2}");
    }

    public OaiIteratorFactory getOaiIteratorFactory() {
        return this.oaiIteratorFactory;
    }

    @Required
    public void setOaiIteratorFactory(final OaiIteratorFactory oaiIteratorFactory) {
        this.oaiIteratorFactory = oaiIteratorFactory;
    }
}

View File

@ -0,0 +1,168 @@
package eu.dnetlib.data.collector.plugins.oai;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import eu.dnetlib.data.collector.plugins.HttpConnector;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
/**
 * Iterator over the records returned by the ListRecords verb of an OAI endpoint. The first page is
 * requested lazily, at the first call of {@link #hasNext()} or {@link #next()}; the following pages
 * are requested through the resumption token whenever the local queue becomes empty.
 */
public class OaiIterator implements Iterator<String> {

    private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM

    // NOTE(review): a PriorityBlockingQueue returns its elements in natural String order, so the records
    // of a page come out sorted by their XML text rather than in harvesting order - confirm this is intended
    private Queue<String> queue = new PriorityBlockingQueue<String>();
    private SAXReader reader = new SAXReader();

    private String baseUrl;
    private String set;
    private String mdFormat;
    private String fromDate;
    private String untilDate;
    private String token;
    private boolean started;
    private HttpConnector httpConnector;

    /**
     * @param baseUrl       url of the OAI endpoint
     * @param mdFormat      metadata prefix to request
     * @param set           set to harvest; null or empty for no set restriction
     * @param fromDate      optional lower bound (YYYY-MM-DD)
     * @param untilDate     optional upper bound (YYYY-MM-DD)
     * @param httpConnector connector used to download the pages
     */
    public OaiIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate, final HttpConnector httpConnector) {
        this.baseUrl = baseUrl;
        this.mdFormat = mdFormat;
        this.set = set;
        this.fromDate = fromDate;
        this.untilDate = untilDate;
        this.started = false;
        this.httpConnector = httpConnector;
    }

    /**
     * Requests the first page, once. If that page carries no record but a resumption token (for
     * instance when it could not be parsed), the following pages are requested too: otherwise
     * hasNext() would report the end of the harvesting while records are still available.
     */
    private void verifyStarted() {
        if (!this.started) {
            this.started = true;
            try {
                this.token = firstPage();
            } catch (CollectorServiceException e) {
                throw new RuntimeException(e);
            }
            fetchUntilRecordsAvailable();
        }
    }

    /**
     * Follows the resumption token until the queue holds a record or no token is left.
     */
    private void fetchUntilRecordsAvailable() {
        while (queue.isEmpty() && (token != null) && !token.isEmpty()) {
            try {
                token = otherPages(token);
            } catch (CollectorServiceException e) {
                throw new RuntimeException(e);
            }
        }
    }

    @Override
    public boolean hasNext() {
        synchronized (queue) {
            verifyStarted();
            return !queue.isEmpty();
        }
    }

    /**
     * @return the next record
     * @throws java.util.NoSuchElementException if no record is left
     * @throws RuntimeException                 if a page cannot be downloaded or processed
     */
    @Override
    public String next() {
        synchronized (queue) {
            verifyStarted();
            final String res = queue.poll();
            if (res == null) {
                throw new java.util.NoSuchElementException("No more OAI records");
            }
            fetchUntilRecordsAvailable();
            return res;
        }
    }

    /** Does nothing: records cannot be removed from the remote repository. */
    @Override
    public void remove() {}

    /**
     * Builds the url of the first ListRecords request and downloads it.
     *
     * @return the resumption token of the first page
     * @throws CollectorServiceException if the page cannot be downloaded or processed
     */
    private String firstPage() throws CollectorServiceException {
        try {
            String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat,"UTF-8");
            if ((set != null) && !set.isEmpty()) {
                url += "&set=" + URLEncoder.encode(set,"UTF-8");
            }
            if ((fromDate != null) && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
                url += "&from=" + URLEncoder.encode(fromDate,"UTF-8");
            }
            if ((untilDate != null) && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
                url += "&until=" + URLEncoder.encode(untilDate,"UTF-8");
            }
            log.info("Start harvesting using url: " + url);

            return downloadPage(url);
        } catch(UnsupportedEncodingException e) {
            throw new CollectorServiceException(e);
        }
    }

    /**
     * Extracts the resumption token with plain string operations, for pages that cannot be parsed.
     *
     * @param xml the page
     * @return the trimmed token, or null when it cannot be found
     */
    private String extractResumptionToken(final String xml) {

        final String s = StringUtils.substringAfter(xml, "<resumptionToken");
        if (s == null){
            return null;
        }

        final String result = StringUtils.substringBetween(s, ">", "</");
        if (result == null)
            return null;
        return result.trim();
    }

    /**
     * @param resumptionToken the token returned by the previous page
     * @return the resumption token of the downloaded page
     * @throws CollectorServiceException if the page cannot be downloaded or processed
     */
    private String otherPages(final String resumptionToken) throws CollectorServiceException {
        try {
            return downloadPage(baseUrl + "?verb=ListRecords&resumptionToken=" + URLEncoder.encode(resumptionToken,"UTF-8"));
        } catch (UnsupportedEncodingException e) {
            throw new CollectorServiceException(e);
        }
    }

    /**
     * Downloads a page, adds its records to the queue and returns its resumption token.
     *
     * @param url the url of the page
     * @return the resumption token; null when the repository answers noRecordsMatch
     * @throws CollectorServiceException if the repository returns another error, or the page can be
     *                                   neither parsed nor searched for a resumption token
     */
    private String downloadPage(final String url) throws CollectorServiceException {

        final String xml = httpConnector.getInputSource(url);
        Document doc;
        try {
            doc = reader.read(new StringReader(xml));
        } catch (DocumentException e) {
            log.warn("Error parsing xml, I try to clean it: " + xml, e);
            final String cleaned = XmlCleaner.cleanAllEntities(xml);
            try {
                doc = reader.read(new StringReader(cleaned));
            } catch (DocumentException e1) {
                // last resort: skip the records of this page, but keep harvesting from its token
                final String resumptionToken = extractResumptionToken(xml);
                if (resumptionToken == null)
                    throw new CollectorServiceException("Error parsing cleaned document:" + cleaned, e1);
                return resumptionToken;
            }
        }

        final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
        if (errorNode != null) {
            final String code = errorNode.valueOf("@code");
            if ("noRecordsMatch".equalsIgnoreCase(code.trim())) {
                log.warn("noRecordsMatch for oai call: " + url);
                return null;
            } else {
                throw new CollectorServiceException(code + " - " + errorNode.getText());
            }
        }

        for (Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
            queue.add(((Node) o).asXML());
        }

        return doc.valueOf("//*[local-name()='resumptionToken']");
    }
}

View File

@ -0,0 +1,25 @@
package eu.dnetlib.data.collector.plugins.oai;
import java.util.Iterator;
import org.springframework.beans.factory.annotation.Required;
import eu.dnetlib.data.collector.plugins.HttpConnector;
/**
 * Factory of {@link OaiIterator} instances that all share one injected
 * {@link HttpConnector}.
 */
public class OaiIteratorFactory {

    /** Connector handed to every iterator created by this factory. */
    private HttpConnector httpConnector;

    /**
     * Creates a new iterator from the given harvesting parameters.
     *
     * @return a fresh {@link OaiIterator} built with the given arguments and this factory's connector
     */
    public Iterator<String> newIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate) {
        final Iterator<String> records = new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, this.httpConnector);
        return records;
    }

    public HttpConnector getHttpConnector() {
        return this.httpConnector;
    }

    @Required
    public void setHttpConnector(final HttpConnector httpConnector) {
        this.httpConnector = httpConnector;
    }
}

View File

@ -0,0 +1,268 @@
package eu.dnetlib.data.collector.plugins.oai.engine;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
/**
 * Repairs common well-formedness problems in XML text: characters and numeric
 * character references that XML 1.0 does not allow, HTML named entities that
 * XML does not define, bare ampersands and doubled angle brackets.
 *
 * @author jochen, Andreas Czerniak
 */
public class XmlCleaner {

    /**
     * A numeric character reference. Group 1 holds the digits of a hexadecimal
     * reference, group 2 the digits of a decimal one.
     */
    private static final Pattern numericReferencePattern = Pattern.compile("&#(?:x([0-9a-fA-F]+)|([0-9]+));"); //$NON-NLS-1$

    /**
     * Pattern that negates the characters allowed by XML 1.0. Valid are:
     * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF].
     * The last range is needed so that characters outside the BMP (stored as
     * surrogate pairs in Java strings) are not deleted.
     */
    private static final Pattern invalidCharacterPattern =
            Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD\\x{10000}-\\x{10FFFF}]"); //$NON-NLS-1$

    /** Names of the HTML Latin-1 entities, ordered by code point from U+00A0 to U+00FF. */
    private static final String[] LATIN1_ENTITY_NAMES = {
            /* U+00A0 */ "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
            /* U+00A8 */ "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
            /* U+00B0 */ "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
            /* U+00B8 */ "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
            /* U+00C0 */ "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
            /* U+00C8 */ "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
            /* U+00D0 */ "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
            /* U+00D8 */ "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
            /* U+00E0 */ "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
            /* U+00E8 */ "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
            /* U+00F0 */ "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
            /* U+00F8 */ "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
    };

    /** Entities that XML itself defines and that must be left untouched. */
    private static final Set<String> goodEntities = new HashSet<String>();

    /** Entities unknown to XML, mapped to the character that replaces them. */
    private static final Map<String, String> badEntities = new HashMap<String, String>();

    static {
        // The five entities predefined by XML 1.0.
        goodEntities.add("&quot;"); //$NON-NLS-1$ // quotation mark
        goodEntities.add("&amp;"); //$NON-NLS-1$ // ampersand
        goodEntities.add("&lt;"); //$NON-NLS-1$ // less-than sign
        goodEntities.add("&gt;"); //$NON-NLS-1$ // greater-than sign
        goodEntities.add("&apos;"); //$NON-NLS-1$ // apostrophe

        // Numeric references are not listed here: they are validated by code point
        // in cleanAllEntities() before the named-entity lookup is ever reached.

        // misc entities
        badEntities.put("&euro;", "\u20AC"); //$NON-NLS-1$ //$NON-NLS-2$ // euro
        badEntities.put("&lsquo;", "\u2018"); //$NON-NLS-1$ //$NON-NLS-2$ // left single quotation mark
        badEntities.put("&rsquo;", "\u2019"); //$NON-NLS-1$ //$NON-NLS-2$ // right single quotation mark

        // Latin 1 entities: the i-th name maps to code point 0xA0 + i
        for (int i = 0; i < LATIN1_ENTITY_NAMES.length; i++) {
            badEntities.put("&" + LATIN1_ENTITY_NAMES[i] + ";", String.valueOf((char) (0xA0 + i)));
        }
    }

    /**
     * Makes the entities of the input acceptable to an XML parser.
     * <ul>
     * <li>characters not allowed by XML 1.0 are removed;</li>
     * <li>numeric character references are kept when they denote an allowed
     * character and removed otherwise;</li>
     * <li>the predefined entities ({@literal &amp;}, {@literal &lt;},
     * {@literal &gt;}, {@literal &quot;}, {@literal &apos;}) are kept;</li>
     * <li>known HTML entities are replaced by their unicode character, unknown
     * named entities are removed;</li>
     * <li>every bare {@literal &} is replaced by {@literal &amp;};</li>
     * <li>finally each {@literal <<} and {@literal >>} is escaped.</li>
     * </ul>
     *
     * @param broken
     *            the string to handle entities, may be {@code null}
     * @return the string with entities appropriately fixed up, or {@code null}
     *         if the input was {@code null}
     */
    static public String cleanAllEntities(final String broken) {
        if (broken == null) {
            return null;
        }
        final String working = invalidCharacterPattern.matcher(broken).replaceAll("");
        final int length = working.length();
        final StringBuilder cleaned = new StringBuilder(length + 16);
        final java.util.regex.Matcher numericReference = numericReferencePattern.matcher(working);

        int pos = 0;
        while (true) {
            final int amp = working.indexOf('&', pos);
            if (amp == -1) {
                // no more ampersands: copy the remainder and stop
                cleaned.append(working, pos, length);
                break;
            }
            cleaned.append(working, pos, amp);

            // References of the kind &#ddd; or &#xhhh;
            if (numericReference.region(amp, length).lookingAt()) {
                if (isValidNumericReference(numericReference)) {
                    cleaned.append(numericReference.group());
                }
                pos = numericReference.end();
                continue;
            }

            // Scan the candidate entity name following the '&'.
            int i = amp + 1;
            while (i < length && Character.isLetterOrDigit(working.charAt(i))) {
                i++;
            }
            if (i < length && working.charAt(i) == ';') {
                // A complete named entity: keep, translate or drop it.
                cleaned.append(handleEntity(working.substring(amp, i + 1)));
                pos = i + 1;
            } else {
                // No closing ';': it is a bare '&'. Resume right after it so that the
                // character that ended the scan (possibly another '&') is examined too.
                cleaned.append("&amp;"); //$NON-NLS-1$
                pos = amp + 1;
            }
        }

        return cleaned.toString().replace("<<", "&lt;&lt;").replace(">>", "&gt;&gt;");
    }

    /**
     * Tells whether a matched numeric character reference denotes a character
     * allowed by XML 1.0.
     *
     * @param reference
     *            a matcher of {@code numericReferencePattern} positioned on a match
     * @return {@code true} if the referenced code point is a legal XML character
     */
    private static boolean isValidNumericReference(final java.util.regex.Matcher reference) {
        final String hexDigits = reference.group(1);
        final int codePoint;
        try {
            codePoint = (hexDigits != null) ? Integer.parseInt(hexDigits, 16) : Integer.parseInt(reference.group(2), 10);
        } catch (NumberFormatException tooLarge) {
            // more digits than fit in an int: certainly not a legal character
            return false;
        }
        return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD
                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
    }

    /**
     * Decides what a named entity becomes: itself when XML predefines it, its
     * unicode equivalent when it is a known HTML entity, nothing otherwise.
     *
     * @param entity
     *            the entity to be replaced, including the leading {@literal &} and the trailing {@code ;}
     * @return the substitution for the entity, either itself, the unicode
     *         equivalent or an empty string.
     */
    private static String handleEntity(final String entity) {
        if (goodEntities.contains(entity)) {
            return entity;
        }
        final String replacement = badEntities.get(entity);
        return replacement != null ? replacement : "";
    }
}

View File

@ -0,0 +1,40 @@
package eu.dnetlib.data.collector.plugins.oaisets;
import java.util.Iterator;
import org.springframework.beans.factory.annotation.Required;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
 * Collector plugin that returns the items produced by an
 * {@link OaiSetsIteratorFactory} for the base URL of the given interface.
 */
public class OaiSetsCollectorPlugin extends AbstractCollectorPlugin {

    /** Factory used to create a new iterator every time the returned iterable is traversed. */
    private OaiSetsIteratorFactory oaiSetsIteratorFactory;

    /**
     * Builds a lazy iterable over the base URL of the interface.
     * {@code fromDate} and {@code untilDate} are not used.
     *
     * @throws CollectorServiceException
     *             if the base URL of the interface is null or empty
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        final String baseUrl = interfaceDescriptor.getBaseUrl();
        final boolean baseUrlMissing = (baseUrl == null) || baseUrl.isEmpty();
        if (baseUrlMissing) {
            throw new CollectorServiceException("Param 'baseurl' is null or empty");
        }

        return new Iterable<String>() {

            @Override
            public Iterator<String> iterator() {
                // The factory is looked up at iteration time, not when collect() is called.
                return oaiSetsIteratorFactory.newIterator(baseUrl);
            }
        };
    }

    public OaiSetsIteratorFactory getOaiSetsIteratorFactory() {
        return this.oaiSetsIteratorFactory;
    }

    @Required
    public void setOaiSetsIteratorFactory(final OaiSetsIteratorFactory oaiSetsIteratorFactory) {
        this.oaiSetsIteratorFactory = oaiSetsIteratorFactory;
    }
}

View File

@ -0,0 +1,133 @@
package eu.dnetlib.data.collector.plugins.oaisets;
import java.io.StringReader;
import java.util.Iterator;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.PriorityBlockingQueue;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import com.google.common.collect.Sets;
import eu.dnetlib.data.collector.plugins.HttpConnector;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
/**
 * Iterator over the sets returned by the {@code ListSets} verb of an OAI
 * endpoint. Each item is the XML of one {@code set} element. Pages are
 * downloaded lazily, following the resumption tokens, and a set whose
 * {@code setSpec} was already seen is not returned again.
 */
public class OaiSetsIterator implements Iterator<String> {

    private static final Log log = LogFactory.getLog(OaiSetsIterator.class);

    /** Sets downloaded but not yet returned by {@link #next()}. */
    private Queue<String> queue = new PriorityBlockingQueue<String>();
    private SAXReader reader = new SAXReader();
    private String baseUrl;
    /** Resumption token of the next page; null or empty when there are no more pages. */
    private String token;
    /** Whether the first page has already been requested. */
    private boolean started;
    private HttpConnector httpConnector;
    /** setSpec values already queued, used to detect servers that keep sending the same sets. */
    private Set<String> setsAlreadySeen = Sets.newHashSet();

    public OaiSetsIterator(final String baseUrl, final HttpConnector httpConnector) {
        this.baseUrl = baseUrl;
        this.started = false;
        this.httpConnector = httpConnector;
    }

    /** Downloads the first page on first use. A failure is rethrown as a {@link RuntimeException}. */
    private void verifyStarted() {
        if (!this.started) {
            this.started = true;
            try {
                this.token = firstPage();
            } catch (CollectorServiceException e) {
                throw new RuntimeException(e);
            }
        }
    }

    @Override
    public boolean hasNext() {
        synchronized (queue) {
            verifyStarted();
            return !queue.isEmpty();
        }
    }

    /**
     * Returns the next set and, when the queue runs dry, eagerly downloads the
     * following pages so that {@link #hasNext()} stays accurate.
     *
     * @return the XML of the next set, or {@code null} if there is none
     */
    @Override
    public String next() {
        synchronized (queue) {
            verifyStarted();
            final String res = queue.poll();
            while (queue.isEmpty() && (token != null) && !token.isEmpty()) {
                try {
                    token = otherPages(token);
                } catch (CollectorServiceException e) {
                    throw new RuntimeException(e);
                }
            }
            return res;
        }
    }

    /** Does nothing: removal is not supported and the call is silently ignored. */
    @Override
    public void remove() {}

    private String firstPage() throws CollectorServiceException {
        final String url = baseUrl + "?verb=ListSets";
        log.info("Start harvesting using url: " + url);
        return downloadPage(url);
    }

    /**
     * Downloads the page identified by a resumption token.
     *
     * @param resumptionToken
     *            the token returned by the previous page; it is URL-encoded
     *            because tokens may contain characters that are not legal in a query string
     * @throws CollectorServiceException
     *             if the token cannot be encoded or the page cannot be downloaded
     */
    private String otherPages(final String resumptionToken) throws CollectorServiceException {
        try {
            return downloadPage(baseUrl + "?verb=ListSets&resumptionToken=" + java.net.URLEncoder.encode(resumptionToken, "UTF-8"));
        } catch (java.io.UnsupportedEncodingException e) {
            throw new CollectorServiceException(e);
        }
    }

    /**
     * Fetches one page, queues the sets not seen before and returns the
     * resumption token of the following page.
     *
     * @param url
     *            the complete request URL
     * @return the resumption token found in the page, or {@code null} when
     *         harvesting must stop (noRecordsMatch, or a page without any new set)
     * @throws CollectorServiceException
     *             if the page cannot be parsed even after cleaning, or the
     *             server reports an OAI error other than noRecordsMatch
     */
    private String downloadPage(final String url) throws CollectorServiceException {
        final String xml = httpConnector.getInputSource(url);
        Document doc;
        try {
            doc = reader.read(new StringReader(xml));
        } catch (DocumentException e) {
            log.warn("Error parsing xml, I try to clean it: " + xml, e);
            final String cleaned = XmlCleaner.cleanAllEntities(xml);
            try {
                doc = reader.read(new StringReader(cleaned));
            } catch (DocumentException e1) {
                throw new CollectorServiceException("Error parsing cleaned document:" + cleaned, e1);
            }
        }

        final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
        if (errorNode != null) {
            final String code = errorNode.valueOf("@code");
            if ("noRecordsMatch".equalsIgnoreCase(code.trim())) {
                log.warn("noRecordsMatch for oai call: " + url);
                return null;
            }
            throw new CollectorServiceException(code + " - " + errorNode.getText());
        }

        // Stays true when the page brings nothing new (this includes a page without any set).
        boolean sawAllSets = true;
        for (Object o : doc.selectNodes("//*[local-name()='ListSets']/*[local-name()='set']")) {
            final String set = ((Element) o).valueOf("./*[local-name()='setSpec']");
            if (!setsAlreadySeen.contains(set)) {
                sawAllSets = false;
                setsAlreadySeen.add(set);
                queue.add(((Node) o).asXML());
            }
        }
        if (sawAllSets) {
            // Dropping the token here prevents an endless loop on servers that never finish.
            log.warn("URL " + baseUrl + " keeps returning the same OAI sets. Please contact the repo admin.");
            return null;
        }
        return doc.valueOf("//*[local-name()='resumptionToken']");
    }
}

View File

@ -0,0 +1,26 @@
package eu.dnetlib.data.collector.plugins.oaisets;
import java.util.Iterator;
import org.springframework.beans.factory.annotation.Required;
import eu.dnetlib.data.collector.plugins.HttpConnector;
/**
 * Factory of {@link OaiSetsIterator} instances that all share one injected
 * {@link HttpConnector}.
 */
public class OaiSetsIteratorFactory {

    /** Connector handed to every iterator created by this factory. */
    private HttpConnector httpConnector;

    /**
     * Creates a new iterator for the given base URL.
     *
     * @param baseUrl
     *            the base URL passed to the new iterator
     * @return a fresh {@link OaiSetsIterator} using this factory's connector
     */
    public Iterator<String> newIterator(String baseUrl) {
        final Iterator<String> sets = new OaiSetsIterator(baseUrl, this.httpConnector);
        return sets;
    }

    public HttpConnector getHttpConnector() {
        return this.httpConnector;
    }

    @Required
    public void setHttpConnector(HttpConnector httpConnector) {
        this.httpConnector = httpConnector;
    }
}

View File

@ -0,0 +1,117 @@
package eu.dnetlib.data.collector.plugins.opentrial;
/**
* Created by miriam on 07/03/2017.
*/
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
import org.apache.commons.io.IOUtils;
import java.net.*;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue;
//import java.util.function.Consumer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.*;
/**
 * Iterable over the entries of a paginated JSON API. Every entry of the
 * {@code items} array of a page is converted to XML and returned as one
 * string. The first page is downloaded by the constructor, the following
 * ones on demand while iterating.
 * <p>
 * The queue of pending entries and the counters are shared by all the
 * iterators returned by {@link #iterator()}, so an instance is meant to be
 * traversed only once.
 * </p>
 */
public class OpenTrialIterator implements Iterable<String> {

    /** Base URL with the query string already appended; only the page number is added per request. */
    private final String base_url;
    /** Value of {@code total_count} read from the first page. */
    private int total;
    /** Entries downloaded but not yet returned; sized for the 100 entries requested per page. */
    private ArrayBlockingQueue<String> trials = new ArrayBlockingQueue<String>(100);
    /** Number of entries queued so far. */
    private int current = 0;
    private static final Log log = LogFactory.getLog(OpenTrialIterator.class);

    /**
     * Builds the query and downloads the first page.
     *
     * @param base_url
     *            the URL the query string is appended to as is (no separator is added)
     * @param from_date
     *            lower bound of the registration date, or {@code null} for no date filter
     * @param to_date
     *            upper bound of the registration date, or {@code null} for an
     *            open range; ignored when {@code from_date} is {@code null}
     * @throws CollectorServiceException
     *             if the first page cannot be downloaded or parsed
     */
    public OpenTrialIterator(String base_url, String from_date, String to_date) throws CollectorServiceException {
        try {
            String q = "per_page=100";
            if (from_date != null) {
                if (to_date != null) {
                    q = "q=registration_date%3A%5B" + from_date + "%20TO%20" + to_date + "%5D&" + q;
                } else {
                    q = "q=registration_date%3A%5B" + from_date + "%20TO%20*%5D&" + q;
                }
            }
            this.base_url = base_url + q;
            log.info("url from which to collect " + this.base_url);
            prepare();
        } catch (Exception ex) {
            throw new CollectorServiceException(ex);
        }
    }

    /** Downloads page 1, reads the total number of entries and queues the entries of the page. */
    private void prepare() throws Exception {
        JSONObject json = new JSONObject(getPage(1));
        total = json.getInt("total_count");
        log.info("Total number of entries to collect: " + total);
        fillTrials(json);
    }

    @Override
    public Iterator<String> iterator() {
        return new Iterator<String>() {

            // page 1 has already been consumed by the constructor
            private int page_number = 2;

            @Override
            public void remove() {
            }

            @Override
            public String next() {
                try {
                    if (trials.isEmpty()) {
                        JSONObject json = new JSONObject(getPage(page_number));
                        fillTrials(json);
                        page_number++;
                        if (trials.isEmpty()) {
                            // The server sent an empty page although fewer than 'total' entries were
                            // received: stop here, otherwise hasNext() would stay true forever.
                            log.warn("Empty page received after " + current + " of " + total + " entries: stopping");
                            current = total;
                        }
                    }
                    return trials.poll();
                } catch (Exception ex) {
                    throw new CollectorServiceRuntimeException(ex);
                }
            }

            @Override
            public boolean hasNext() {
                log.debug("More entries to collect: (" + current + "<" + total + "=" + (current < total));
                return (current < total || !trials.isEmpty());
            }
        };
    }

    /** Converts every element of the {@code items} array to XML and queues it. */
    private void fillTrials(JSONObject json) throws CollectorServiceException {
        JSONArray entries = json.getJSONArray("items");
        for (Object entry : entries) {
            try {
                trials.put(XML.toString(entry));
            } catch (Exception ex) {
                throw new CollectorServiceException(ex);
            }
            current++;
        }
    }

    /**
     * Downloads one page of results.
     *
     * @param page_number
     *            the number of the page to request
     * @return the body of the response, decoded as UTF-8
     * @throws CollectorServiceException
     *             if the request fails
     */
    private String getPage(int page_number) throws CollectorServiceException {
        try {
            URL url = new URL(base_url + "&page=" + page_number);
            URLConnection conn = url.openConnection();
            conn.setRequestProperty("User-Agent", "Mozilla/5.0");
            final java.io.InputStream in = conn.getInputStream();
            try {
                // explicit charset: the platform default must not influence how the JSON is decoded
                return IOUtils.toString(in, "UTF-8");
            } finally {
                // always release the connection's stream, even when reading fails
                IOUtils.closeQuietly(in);
            }
        } catch (Exception ex) {
            throw new CollectorServiceException(ex);
        }
    }
}

View File

@ -0,0 +1,27 @@
package eu.dnetlib.data.collector.plugins.opentrial;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
* Created by miriam on 07/03/2017.
*/
public class OpenTrialPlugin extends AbstractCollectorPlugin {

    /**
     * Creates an {@link OpenTrialIterator} over the base URL of the interface,
     * restricted to the given date range.
     *
     * @throws CollectorServiceException
     *             wrapping any exception raised while the iterator is created
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        try {
            return new OpenTrialIterator(interfaceDescriptor.getBaseUrl(), fromDate, untilDate);
        } catch (Exception cause) {
            throw new CollectorServiceException("OOOPS something bad happen on creating iterator ", cause);
        }
    }
}

View File

@ -0,0 +1,32 @@
package eu.dnetlib.data.collector.plugins.projects.grist;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
* Plugin to collect metadata records about projects and funding via the europePMC GRIST API (e.g. WT projects).
* <p>
* Documentation on GRIST API: http://europepmc.org/GristAPI.
* </p>
* <p>
* BaseURL: http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:"Wellcome Trust"&resultType=core
* where resultType=core asks for the complete information (including abstracts).
* The results returned by the API are XMLs.
* </p>
* <p>
* Pagination: use parameter 'page'. When the response contains empty 'RecordList', it means we reached the end.
* </p>
*
* @author alessia
*/
public class GristCollectorPlugin extends AbstractCollectorPlugin {

    /**
     * Returns a {@link GristProjectsIterable} over the base URL of the
     * interface. {@code fromDate} and {@code untilDate} are not used.
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
            throws CollectorServiceException {
        //baseURL: http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:%22Wellcome%20Trust%22&resultType=core
        final String queryUrl = interfaceDescriptor.getBaseUrl();
        return new GristProjectsIterable(queryUrl);
    }
}

View File

@ -0,0 +1,136 @@
package eu.dnetlib.data.collector.plugins.projects.grist;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
import eu.dnetlib.enabling.resultset.SizedIterable;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
public class GristProjectsIterable implements SizedIterable<String> {
private static final Log log = LogFactory.getLog(GristProjectsIterable.class); // NOPMD by marko on 11/24/08 5:02 PM
private String queryURL;
private int total;
private SAXReader reader;
public GristProjectsIterable(String baseURL) throws CollectorServiceException {
queryURL = baseURL;
reader = new SAXReader();
total = getTotalCount();
}
@Override
public int getNumberOfElements() {
return total;
}
private int getTotalCount() throws CollectorServiceException {
try {
URL pageUrl = new URL(queryURL);
log.debug("Getting hit count from: " + pageUrl.toString());
String resultPage = IOUtils.toString(pageUrl);
Document doc = reader.read(IOUtils.toInputStream(resultPage));
String hitCount = doc.selectSingleNode("/Response/HitCount").getText();
return Integer.parseInt(hitCount);
} catch (NumberFormatException e) {
log.warn("Cannot set the total count from '/Response/HitCount'");
} catch (DocumentException e) {
throw new CollectorServiceException(e);
} catch (MalformedURLException e) {
throw new CollectorServiceException(e);
} catch (IOException e) {
throw new CollectorServiceException(e);
}
return -1;
}
@Override
public Iterator<String> iterator() {
return new Iterator<String>() {
private Queue<String> projects = new PriorityBlockingQueue<String>();
private boolean morePages = true;
private int pageNumber = 0;
private SAXReader reader = new SAXReader();
//The following is for debug only
private int nextCounter = 0;
@Override
public boolean hasNext() {
try {
fillProjectListIfNeeded();
} catch (CollectorServiceException e) {
throw new CollectorServiceRuntimeException(e);
}
return !projects.isEmpty();
}
@Override
public String next() {
nextCounter++;
log.debug(String.format("Calling next %s times. projects queue has %s elements", nextCounter, projects.size()));
try {
fillProjectListIfNeeded();
return projects.poll();
} catch (CollectorServiceException e) {
throw new CollectorServiceRuntimeException(e);
}
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
private boolean fillProjectListIfNeeded() throws CollectorServiceException {
if (morePages && projects.isEmpty()) {
String resultPage = getNextPage();
Document doc = null;
try {
doc = reader.read(IOUtils.toInputStream(resultPage));
List<Element> records = doc.selectNodes("//RecordList/Record");
if (records != null && !records.isEmpty()) {
for (Element p : records) {
projects.add(p.asXML());
}
return true;
} else {
log.info("No more projects to read at page nr. " + pageNumber);
morePages = false;
return false;
}
} catch (DocumentException e) {
throw new CollectorServiceException(e);
}
} else return false;
}
private String getNextPage() {
pageNumber++;
try {
URL pageUrl = new URL(queryURL + "&page=" + pageNumber);
log.debug("Getting page at: " + pageUrl.toString());
return IOUtils.toString(pageUrl);
} catch (Exception e) {
throw new CollectorServiceRuntimeException("Error on page " + pageNumber, e);
}
}
};
}
}

View File

@ -0,0 +1,32 @@
package eu.dnetlib.data.collector.plugins.projects.gtr2;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
* Plugin to collect metadata records about projects and funding via the RCUK gtr2 API.
* <p>
* Documentation : http://gtr.rcuk.ac.uk/resources/api.html.
* </p>
* <p>
* BaseURL: http://gtr.rcuk.ac.uk/gtr/api
* The results returned by the API are XMLs.
* </p>
* <p>
* Pagination: TO BE DEFINED. Exceeding the number of pages available will result in a HTTP response code of 404
* </p>
*
* @author alessia
*/
public class Gtr2CollectorPlugin extends AbstractCollectorPlugin {
@Override
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
throws CollectorServiceException {
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + fromDate); }
return new Gtr2ProjectsIterable(interfaceDescriptor.getBaseUrl(), fromDate);
}
}

View File

@ -0,0 +1,181 @@
package eu.dnetlib.data.collector.plugins.projects.gtr2;
import java.io.ByteArrayOutputStream;
import java.io.StringWriter;
import java.util.concurrent.*;
import com.ximpleware.AutoPilot;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.data.collector.plugins.HttpConnector;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.lang3.*;
/**
 * Builds the "project package" of a single gtr2 project: the project XML itself followed by
 * fragments describing the linked funds, funders, organisations and principal investigators,
 * all wrapped in one {@code <doc>} element.
 * <p>
 * Linked resources are downloaded when their task is created (on the calling thread) and
 * appended to the shared writer by a small thread pool, so the order of the appended
 * fragments is not deterministic. Every task appends its output with a single write so
 * that fragments of different tasks never interleave.
 * </p>
 * <p>
 * An instance keeps per-call state in fields; use one instance per project.
 * </p>
 */
public class Gtr2Helper {

    private static final Log log = LogFactory.getLog(Gtr2Helper.class); // NOPMD by marko on 11/24/08 5:02 PM

    /** Documents are handed to VTD as UTF-8 bytes, so dumped fragments must be decoded as UTF-8 too. */
    private static final String UTF_8 = "UTF-8";
    private static final int WORKER_THREADS = 5;
    private static final int WAIT_MINUTES = 10;

    private VTDNav mainVTDNav;
    private AutoPilot mainAutoPilot;
    private StringWriter writer;

    /**
     * Serializes the project under the navigator together with its linked resources.
     *
     * @param vn         navigator positioned on the project document
     * @param namespaces namespace declarations to put on the {@code <doc>} wrapper
     * @return the assembled XML document as a string
     * @throws Exception when the project document cannot be navigated or dumped
     */
    public String processProject(final VTDNav vn, final String namespaces) throws Exception {
        //log.debug("Processing project at "+projectURL);
        writer = new StringWriter();
        mainVTDNav = vn;
        mainAutoPilot = new AutoPilot(mainVTDNav);
        writer.write("<doc " + namespaces + ">");
        writeFragment(mainVTDNav);

        final ExecutorService es = Executors.newFixedThreadPool(WORKER_THREADS);
        try {
            mainAutoPilot.selectXPath("//link[@rel='FUND']");
            while (mainAutoPilot.evalXPath() != -1) {
                es.execute(new ProcessFunder(currentHref()));
            }
            mainAutoPilot.resetXPath();

            mainAutoPilot.selectXPath(".//link[@rel='LEAD_ORG']");
            while (mainAutoPilot.evalXPath() != -1) {
                es.execute(new Org(currentHref(), new String[] { "<ld-org>", "</ld-org>" }));
            }
            mainAutoPilot.resetXPath();

            mainAutoPilot.selectXPath(".//link[@rel='PP_ORG']");
            while (mainAutoPilot.evalXPath() != -1) {
                es.execute(new Org(currentHref(), new String[] { "<pp-org>", "</pp-org>" }));
            }
            mainAutoPilot.resetXPath();

            mainAutoPilot.selectXPath(".//link[@rel='PI_PER']");
            while (mainAutoPilot.evalXPath() != -1) {
                es.execute(new PiPer(currentHref()));
            }
        } finally {
            // always release the pool, even when the navigation above fails
            es.shutdown();
        }

        log.debug("Waiting threads");
        if (!es.awaitTermination(WAIT_MINUTES, TimeUnit.MINUTES)) {
            log.warn("Linked resources not written within " + WAIT_MINUTES + " minutes: the project package may be incomplete");
            es.shutdownNow();
        }
        log.debug("Finished writing project");
        writer.write("</doc>");
        writer.close();
        return writer.toString();
    }

    /** Reads the href attribute of the element the main navigator currently points to. */
    private String currentHref() throws Exception {
        return mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href"));
    }

    /**
     * Downloads and parses the document at the given URL.
     *
     * @return a navigator on the parsed document, or {@code null} when download or parsing fails
     *         (the failure is logged; callers must cope with {@code null})
     */
    private VTDNav setNavigator(final String httpUrl) {
        final VTDGen vg_tmp = new VTDGen();
        // a connector per call: tasks may call this concurrently
        final HttpConnector connector = new HttpConnector();
        try {
            byte[] bytes = connector.getInputSource(httpUrl).getBytes(UTF_8);
            vg_tmp.setDoc(bytes);
            vg_tmp.parse(false);
            //vg_tmp.parseHttpUrl(httpUrl, false);
            return vg_tmp.getNav();
        } catch (Throwable e) {
            log.warn("Cannot load " + httpUrl + ": " + e.getMessage(), e);
            return null;
        }
    }

    /** Evaluates the XPath on the navigator (moving its cursor) and returns the first match index, or -1. */
    private int evalXpath(final VTDNav fragmentVTDNav, final String xPath) throws Exception {
        AutoPilot ap_tmp = new AutoPilot(fragmentVTDNav);
        ap_tmp.selectXPath(xPath);
        return ap_tmp.evalXPath();
    }

    /** Appends the XML of the element under the navigator's cursor to the writer in one write. */
    private void writeFragment(final VTDNav nav) throws Exception {
        final ByteArrayOutputStream b = new ByteArrayOutputStream();
        nav.dumpFragment(b);
        writer.write(b.toString(UTF_8));
    }

    /**
     * Builds {@code open + value + close} for the first node matching the XPath, where the value is
     * the given attribute of that node or, when {@code attrName} is {@code null}, its text.
     * The value is XML-escaped in both cases.
     *
     * @return the element string, or an empty string when nothing matches or the value is absent
     */
    private String buildElement(final VTDNav vn, final String xPath, final String xmlOpenTag, final String xmlCloseTag,
            final String attrName) throws Exception {
        if (evalXpath(vn, xPath) == -1) {
            return "";
        }
        final int token = (attrName != null) ? vn.getAttrVal(attrName) : vn.getText();
        if (token == -1) {
            return "";
        }
        return xmlOpenTag + StringEscapeUtils.escapeXml11(vn.toNormalizedString(token)) + xmlCloseTag;
    }

    /** Writes the element built by {@link #buildElement} (if any) with a single write. */
    private void writeNewTagAndInfo(final VTDNav vn, final String xPath, final String xmlOpenTag, final String xmlCloseTag,
            final String attrName) throws Exception {
        final String element = buildElement(vn, xPath, xmlOpenTag, xmlCloseTag, attrName);
        if (!element.isEmpty()) {
            writer.write(element);
        }
    }

    /** Appends the document of a principal investigator as is. */
    private class PiPer implements Runnable {

        private VTDNav vn;

        public PiPer(String httpURL) {
            vn = setNavigator(httpURL);
        }

        @Override
        public void run() {
            if (vn == null) {
                return; // download failure already logged by setNavigator
            }
            try {
                writeFragment(vn);
            } catch (Throwable e) {
                log.warn("Eccezione in PiPer " + e.getMessage(), e);
            }
        }
    }

    /** Appends name, country and id of an organisation, wrapped in the given pair of tags. */
    private class Org implements Runnable {

        private String[] tags;
        private VTDNav vn;

        public Org(final String httpURL, final String[] tags) {
            vn = setNavigator(httpURL);
            this.tags = tags;
        }

        @Override
        public void run() {
            if (vn == null) {
                return; // download failure already logged by setNavigator
            }
            try {
                // assemble the whole element first: wrapper tags stay balanced and
                // concurrent tasks cannot interleave their output with ours
                final StringBuilder element = new StringBuilder(tags[0]);
                element.append(buildElement(vn, "//name", "<name>", "</name>", null));
                vn.toElement(VTDNav.ROOT);
                element.append(buildElement(vn, "//country", "<country>", "</country>", null));
                vn.toElement(VTDNav.ROOT);
                element.append(buildElement(vn, ".", "<id>", "</id>", "id"));
                element.append(tags[1]);
                writer.write(element.toString());
            } catch (Throwable e) {
                log.warn("Eccezione in Org " + e.getMessage(), e);
            }
        }
    }

    /** Appends the document of a fund, then the name of every funder it links to. */
    private class ProcessFunder implements Runnable {

        private VTDNav vn;

        public ProcessFunder(final String httpURL) {
            vn = setNavigator(httpURL);
        }

        @Override
        public void run() {
            if (vn == null) {
                return; // download failure already logged by setNavigator
            }
            try {
                AutoPilot ap = new AutoPilot(vn);
                writeFragment(vn);
                ap.selectXPath(".//link[@rel='FUNDER']");
                while (ap.evalXPath() != -1) {
                    final VTDNav tmp_vn = setNavigator(vn.toNormalizedString(vn.getAttrVal("href")));
                    if (tmp_vn != null) {
                        // a funder that cannot be loaded is skipped, the remaining ones are still written
                        writeNewTagAndInfo(tmp_vn, "//name", "<funder> <name>", "</name></funder>", null);
                    }
                }
            } catch (Throwable e) {
                log.warn("Eccezione in Funder" + e.getMessage(), e);
            }
        }
    }
}

View File

@ -0,0 +1,352 @@
package eu.dnetlib.data.collector.plugins.projects.gtr2;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import com.ximpleware.AutoPilot;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
import eu.dnetlib.enabling.resultset.SizedIterable;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import eu.dnetlib.data.collector.plugins.HttpConnector;
/**
 * Iterable over the projects exposed by the gtr2 API.
 * <p>
 * A producer thread, started by the constructor, walks the result pages from
 * {@code startFromPage} to {@code endAtPage}, downloads every project and hands it to a
 * worker pool; the workers build the project package and put it on a bounded queue. The
 * iterator consumes that queue. The producer always ends the stream with {@link #TERMINATOR},
 * after all workers have finished.
 * </p>
 * <p>
 * All iterators obtained from one instance share the same queue and look-ahead element.
 * </p>
 * Created by alessia on 28/11/16.
 */
public class Gtr2ProjectsIterable implements SizedIterable<String> {

    /** Sentinel enqueued by the producer after the last project. */
    public static final String TERMINATOR = "ARNOLD";
    /** Seconds the consumer waits for the next element before giving up. */
    public static final int WAIT_END_SECONDS = 120;
    /** Size of the worker pool. The misspelled name is kept because the constant is public. */
    public static final int PAGE_SZIE = 20;

    private static final Log log = LogFactory.getLog(Gtr2ProjectsIterable.class);

    private String queryURL;
    private int total = -1;
    private int startFromPage = 1;
    private int endAtPage;
    private VTDGen vg;
    private VTDNav vn;
    private AutoPilot ap;
    private String namespaces;
    private boolean incremental = false;
    private DateTime fromDate;
    private DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
    private ArrayBlockingQueue<String> projects = new ArrayBlockingQueue<String>(20);
    //private boolean finished = false;
    private final ExecutorService es = Executors.newFixedThreadPool(PAGE_SZIE);
    private String nextElement = null;
    private HttpConnector connector;

    /**
     * Collects all the pages of the result set.
     *
     * @param baseUrl  base URL of the API; "/projects" is appended to it
     * @param fromDate optional date (yyyy-MM-dd); when not blank only projects created or updated after it are collected
     * @throws CollectorServiceException when the first page cannot be read
     */
    public Gtr2ProjectsIterable(final String baseUrl, final String fromDate) throws CollectorServiceException {
        prepare(baseUrl, fromDate);
        fillInfo(true);
    }

    /**
     * Collects only the given range of pages; the total number of elements stays unknown (-1).
     *
     * @param baseUrl       base URL of the API; "/projects" is appended to it
     * @param fromDate      optional date (yyyy-MM-dd); when not blank only projects created or updated after it are collected
     * @param startFromPage first page to collect
     * @param endAtPage     last page to collect
     * @throws CollectorServiceException when the first page cannot be read
     */
    public Gtr2ProjectsIterable(final String baseUrl, final String fromDate, final int startFromPage, final int endAtPage) throws CollectorServiceException {
        prepare(baseUrl, fromDate);
        this.setStartFromPage(startFromPage);
        this.setEndAtPage(endAtPage);
        fillInfo(false);
    }

    private void prepare(final String baseUrl, final String fromDate) {
        connector = new HttpConnector();
        queryURL = baseUrl + "/projects";
        vg = new VTDGen();
        this.incremental = StringUtils.isNotBlank(fromDate);
        if (incremental) {
            // I expect fromDate in the format 'yyyy-MM-dd'. See class eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode
            this.fromDate = DateTime.parse(fromDate, simpleDateTimeFormatter);
            log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString());
        }
    }

    /** @return the hit count declared by the API, or -1 when an explicit page range was requested */
    @Override
    public int getNumberOfElements() {
        return total;
    }

    /**
     * Reads counters and namespace declarations from the first page, then starts the producer thread.
     *
     * @param all when {@code true} the page range and the total are taken from the response
     */
    private void fillInfo(final boolean all) throws CollectorServiceException {
        try {
            // log.debug("Getting hit count from: " + queryURL);
            byte[] bytes = connector.getInputSource(queryURL).getBytes("UTF-8");
            vg.setDoc(bytes);
            vg.parse(false);
            //vg.parseHttpUrl(queryURL, false);
            initParser();
            String hitCount = vn.toNormalizedString(vn.getAttrVal("totalSize"));
            String totalPages = vn.toNormalizedString(vn.getAttrVal("totalPages"));
            final StringBuilder declarations = new StringBuilder();
            for (int i = 1; i <= 6; i++) {
                declarations.append("xmlns:ns").append(i).append("=\"")
                        .append(vn.toNormalizedString(vn.getAttrVal("ns" + i))).append("\" ");
            }
            namespaces = declarations.toString();
            if (all) {
                setEndAtPage(Integer.parseInt(totalPages));
                total = Integer.parseInt(hitCount);
            }
            Thread ft = new Thread(new FillProjectList());
            ft.start();
            log.debug("Expected number of pages: " + (endAtPage - startFromPage + 1));
        } catch (NumberFormatException e) {
            log.error("Cannot set the total count or the number of pages");
            throw new CollectorServiceException(e);
        } catch (Throwable e) {
            throw new CollectorServiceException(e);
        }
    }

    /**
     * @return an iterator that blocks up to {@link #WAIT_END_SECONDS} seconds waiting for the next
     *         project and ends when the terminator is reached or the wait times out
     */
    @Override
    public Iterator<String> iterator() {
        return new Iterator<String>() {

            // The following is for debug only
            private int nextCounter = 0;

            @Override
            public boolean hasNext() {
                try {
                    log.debug("hasNext?");
                    if (nextElement == null) {
                        nextElement = projects.poll(WAIT_END_SECONDS, TimeUnit.SECONDS);
                        log.debug("Exit poll :-)");
                    }
                    return nextElement != null && !nextElement.equals(TERMINATOR);
                } catch (InterruptedException e) {
                    // keep the interruption visible to the caller's thread
                    Thread.currentThread().interrupt();
                    throw new CollectorServiceRuntimeException(e);
                }
            }

            @Override
            public String next() {
                nextCounter++;
                log.debug(String.format("Calling next %s times.", nextCounter));
                // going through hasNext() makes next() usable on its own and
                // guarantees that the terminator is never returned as a record
                if (!hasNext()) {
                    throw new NoSuchElementException();
                }
                String res = nextElement;
                nextElement = null;
                return res;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

    private void initParser() {
        vn = vg.getNav();
        ap = new AutoPilot(vn);
    }

    public String getQueryURL() {
        return queryURL;
    }

    public void setQueryURL(final String queryURL) {
        this.queryURL = queryURL;
    }

    public int getTotal() {
        return total;
    }

    public void setTotal(final int total) {
        this.total = total;
    }

    public int getEndAtPage() {
        return endAtPage;
    }

    public void setEndAtPage(final int endAtPage) {
        this.endAtPage = endAtPage;
        log.debug("Overriding endAtPage to " + endAtPage);
    }

    public VTDGen getVg() {
        return vg;
    }

    public void setVg(final VTDGen vg) {
        this.vg = vg;
    }

    public VTDNav getVn() {
        return vn;
    }

    public void setVn(final VTDNav vn) {
        this.vn = vn;
    }

    public AutoPilot getAp() {
        return ap;
    }

    public void setAp(final AutoPilot ap) {
        this.ap = ap;
    }

    public String getNamespaces() {
        return namespaces;
    }

    public void setNamespaces(final String namespaces) {
        this.namespaces = namespaces;
    }

    public int getStartFromPage() {
        return startFromPage;
    }

    public void setStartFromPage(final int startFromPage) {
        this.startFromPage = startFromPage;
        log.debug("Overriding startFromPage to " + startFromPage);
    }

    /** Producer: walks the result pages and submits one {@link ParseProject} per listed project. */
    private class FillProjectList implements Runnable {

        private boolean morePages = true;
        private int pageNumber = startFromPage;

        @Override
        public void run() {
            String resultPageUrl = "";
            try {
                do {
                    resultPageUrl = getNextPageUrl();
                    log.debug("Page: " + resultPageUrl);
                    // clear VGen before processing the next file
                    vg.clear();
                    byte[] bytes = connector.getInputSource(resultPageUrl).getBytes("UTF-8");
                    vg.setDoc(bytes);
                    vg.parse(false);
                    //vg.parseHttpUrl(resultPageUrl, false);
                    initParser();
                    ap.selectXPath("//project");
                    while (ap.evalXPath() != -1) {
                        final String projectHref = vn.toNormalizedString(vn.getAttrVal("href"));
                        es.execute(new ParseProject(projectHref));
                    }
                    ap.resetXPath();
                } while (morePages);
            } catch (Throwable e) {
                log.error("Exception processing " + resultPageUrl + "\n" + e.getMessage(), e);
            } finally {
                // also after a failure: release the pool and tell the consumer that the stream is over
                closeStream();
            }
        }

        /** Waits for the submitted projects, then enqueues the terminator as the very last element. */
        private void closeStream() {
            es.shutdown();
            try {
                while (!es.awaitTermination(WAIT_END_SECONDS, TimeUnit.SECONDS)) {
                    // the terminator must not overtake projects that are still being processed
                    log.debug("Still waiting for the project workers to complete");
                }
                projects.put(TERMINATOR);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                es.shutdownNow();
                if (!projects.offer(TERMINATOR)) {
                    log.warn("Interrupted with a full queue: terminator not enqueued");
                }
            }
        }

        private String getNextPageUrl() {
            String url = queryURL + "?p=" + pageNumber;
            // '>=' also stops when the range is empty (endAtPage lower than startFromPage)
            if (pageNumber >= endAtPage) {
                morePages = false;
            }
            pageNumber++;
            return url;
        }
    }

    /** Worker: decides whether a project has to be collected and, if so, enqueues its package. */
    private class ParseProject implements Runnable {

        VTDNav vn1;
        VTDGen vg1;
        private String projectRef;

        /** Downloads and parses the project on the calling thread; on failure {@code vn1} stays {@code null}. */
        public ParseProject(final String projectHref) {
            projectRef = projectHref;
            vg1 = new VTDGen();
            try {
                byte[] bytes = connector.getInputSource(projectRef).getBytes("UTF-8");
                vg1.setDoc(bytes);
                vg1.parse(false);
                //vg1.parseHttpUrl(projectRef, false);
                vn1 = vg1.getNav();
            } catch (Throwable e) {
                log.error("Exception processing " + projectRef + "\n" + e.getMessage(), e);
            }
        }

        /**
         * @return the index of the attribute when its date is after {@code fromDate}, -1 when the
         *         attribute is missing or its date is not after {@code fromDate}
         */
        private int projectsUpdate(String attr) throws CollectorServiceException {
            try {
                int index = vn1.getAttrVal(attr);
                if (index != -1) {
                    String d = vn1.toNormalizedString(index);
                    // only the date part counts; values without a time part are taken as they are
                    final int timeSeparator = d.indexOf("T");
                    final String day = timeSeparator >= 0 ? d.substring(0, timeSeparator) : d;
                    DateTime recordDate = DateTime.parse(day, simpleDateTimeFormatter);
                    // updated or created after the last time it was collected
                    if (recordDate.isAfter(fromDate)) {
                        log.debug("New project to collect");
                        return index;
                    }
                    return -1;
                }
                return index;
            } catch (Throwable e) {
                throw new CollectorServiceException(e);
            }
        }

        private String collectProject() throws CollectorServiceException {
            try {
                int p = vn1.getAttrVal("href");
                // the href is used for logging only: fall back to the URL we downloaded from
                final String projectHref = (p != -1) ? vn1.toNormalizedString(p) : projectRef;
                log.debug("collecting project at " + projectHref);
                Gtr2Helper gtr2Helper = new Gtr2Helper();
                return gtr2Helper.processProject(vn1, namespaces);
            } catch (Throwable e) {
                throw new CollectorServiceException(e);
            }
        }

        private boolean add(String attr) throws CollectorServiceException {
            return projectsUpdate(attr) != -1;
        }

        @Override
        public void run() {
            log.debug("Getting project info from " + projectRef);
            if (vn1 == null) {
                // the download failure was logged by the constructor
                log.error("Skipping project that could not be loaded: " + projectRef);
                return;
            }
            try {
                if (!incremental || add("created") || add("updated")) {
                    projects.put(collectProject());
                    log.debug("Project enqueued " + projectRef);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                log.error("Interrupted on ParseProject " + projectRef, e);
            } catch (Throwable e) {
                // nobody observes an exception thrown from a pool thread: log it with its cause instead
                log.error("Error on ParseProject " + e.getMessage(), e);
            }
        }
    }
}

View File

@ -0,0 +1,59 @@
/**
*
*/
package eu.dnetlib.data.collector.plugins.rest;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.lang3.StringUtils;
/**
 * Collector plugin for generic REST APIs: reads its settings from the interface
 * descriptor, checks the mandatory ones and delegates the harvesting to {@link RestIterator}.
 *
 * @author js, Andreas Czerniak
 *
 */
public class RestCollectorPlugin extends AbstractCollectorPlugin {

    /**
     * Validates the interface parameters and returns an iterable whose iterators are
     * created lazily, one {@link RestIterator} per call.
     *
     * @param ifDescriptor supplies the base URL and the parameter map
     * @param arg1         not used
     * @param arg2         not used
     * @throws CollectorServiceException when a mandatory parameter is null, empty or blank
     */
    @Override
    public Iterable<String> collect(InterfaceDescriptor ifDescriptor, String arg1, String arg2)
            throws CollectorServiceException {
        final String baseUrl = ifDescriptor.getBaseUrl();
        final String resumptionType = param(ifDescriptor, "resumptionType");
        final String resumptionParam = param(ifDescriptor, "resumptionParam");
        final String resumptionXpath = param(ifDescriptor, "resumptionXpath");
        final String resultTotalXpath = param(ifDescriptor, "resultTotalXpath");
        final String resultFormatParam = param(ifDescriptor, "resultFormatParam");
        final String resultFormatValue = param(ifDescriptor, "resultFormatValue");
        final String resultSizeParam = param(ifDescriptor, "resultSizeParam");
        final String configuredSize = param(ifDescriptor, "resultSizeValue");
        // page size falls back to 100 when not configured
        final String resultSizeValue = StringUtils.isBlank(configuredSize) ? "100" : configuredSize;
        final String queryParams = param(ifDescriptor, "queryParams");
        final String entityXpath = param(ifDescriptor, "entityXpath");

        // Deliberately NOT checked: resumptionXpath, resultTotalXpath, and resultFormatParam
        // (it may be empty because some REST APIs do not accept that argument in the query).
        // A lower bound on resultSizeValue is not enforced here either; resumptionType
        // 'discover' needs a value of at least 2.
        requireNotBlank(baseUrl, "baseUrl");
        requireNotBlank(resumptionType, "resumptionType");
        requireNotBlank(resumptionParam, "resumptionParam");
        requireNotBlank(resultFormatValue, "resultFormatValue");
        requireNotBlank(resultSizeParam, "resultSizeParam");
        requireNotBlank(queryParams, "queryParams");
        requireNotBlank(entityXpath, "entityXpath");

        return () -> new RestIterator(
                baseUrl,
                resumptionType,
                resumptionParam,
                resumptionXpath,
                resultTotalXpath,
                resultFormatParam,
                resultFormatValue,
                resultSizeParam,
                resultSizeValue,
                queryParams,
                entityXpath);
    }

    /** Looks a single parameter up in the descriptor's parameter map. */
    private static String param(final InterfaceDescriptor descriptor, final String key) {
        return descriptor.getParams().get(key);
    }

    /** Rejects a null, empty or whitespace-only value of the named parameter. */
    private static void requireNotBlank(final String value, final String name) throws CollectorServiceException {
        if (StringUtils.isBlank(value)) {
            throw new CollectorServiceException("Param '" + name + "' is null or empty");
        }
    }
}

View File

@ -0,0 +1,343 @@
/**
* log.debug(...) equal to log.trace(...) in the application-logs
* <p>
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
*/
package eu.dnetlib.data.collector.plugins.rest;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.URL;
import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.*;
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
/**
 * Iterator over the records of a paginated REST API.
 * <p>
 * Pages are downloaded on demand. Every page is parsed as XML (JSON responses are first
 * converted to XML under a {@code recordWrap} root), the nodes selected by the entity XPath
 * are serialized and queued, and the URL of the next page is computed according to the
 * resumption type: {@code scan}, {@code count}, {@code discover}, {@code page} / {@code pagination}.
 * </p>
 * <p>
 * Records are handed out by a {@link PriorityBlockingQueue}, i.e. in the natural order of
 * the queued strings, not necessarily in the order of the response.
 * </p>
 *
 * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
 * @date 2018-09-03
 *
 */
public class RestIterator implements Iterator<String> {

    // TODO: clean up the comments of replaced source code
    private static final Log log = LogFactory.getLog(RestIterator.class); // NOPMD by marko on 11/24/08 5:02 PM

    private static final String wrapName = "recordWrap";
    private static final String UTF_8 = "UTF-8";

    private String baseUrl;
    private String resumptionType;
    private String resumptionParam;
    private String resultFormatValue;
    private String queryParams;
    private int resultSizeValue;
    private int resumptionInt = 0; // integer resumption token (first record to harvest)
    private int resultTotal = -1;
    private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest or token scanned from results)
    private Transformer transformer;
    private XPath xpath;
    private String query;
    private XPathExpression xprResultTotalPath; // null when no resultTotalXpath was configured
    private XPathExpression xprResumptionPath;
    private XPathExpression xprEntity;
    private String queryFormat;
    private String querySize;
    private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
    private int discoverResultSize = 0;
    private int pagination = 1;

    /**
     * Prepares the first query; nothing is downloaded until the iterator is used.
     *
     * @throws NumberFormatException when {@code resultSizeValueStr} is not an integer
     * @throws IllegalStateException when the XML transformer or one of the XPath expressions cannot be set up
     */
    public RestIterator(
            final String baseUrl,
            final String resumptionType,
            final String resumptionParam,
            final String resumptionXpath,
            final String resultTotalXpath,
            final String resultFormatParam,
            final String resultFormatValue,
            final String resultSizeParam,
            final String resultSizeValueStr,
            final String queryParams,
            final String entityXpath
    ) {
        this.baseUrl = baseUrl;
        this.resumptionType = resumptionType;
        this.resumptionParam = resumptionParam;
        this.resultFormatValue = resultFormatValue;
        this.queryParams = queryParams;
        this.resultSizeValue = Integer.valueOf(resultSizeValueStr);

        queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
        querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";

        try {
            initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
        } catch (Exception e) {
            throw new IllegalStateException("xml transformation init failed: " + e.getMessage(), e);
        }
        initQueue();
    }

    private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
            throws TransformerConfigurationException, XPathExpressionException {
        transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
        xpath = XPathFactory.newInstance().newXPath();
        // the total is not needed by every resumption type (e.g. 'discover'):
        // its absence is reported only if the value is actually required
        xprResultTotalPath = StringUtils.isBlank(resultTotalXpath) ? null : xpath.compile(resultTotalXpath);
        xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
        xprEntity = xpath.compile(entityXpath);
    }

    private void initQueue() {
        query = baseUrl + "?" + queryParams + querySize + queryFormat;
    }

    /**
     * Downloads pages until a record is available or no further query is left, so a
     * {@code true} answer guarantees that {@link #next()} returns a record.
     *
     * @see java.util.Iterator#hasNext()
     */
    @Override
    public boolean hasNext() {
        synchronized (recordQueue) {
            fillQueue();
            return !recordQueue.isEmpty();
        }
    }

    /**
     * @throws java.util.NoSuchElementException when the API has no more records
     * @see java.util.Iterator#next()
     */
    @Override
    public String next() {
        synchronized (recordQueue) {
            fillQueue();
            final String record = recordQueue.poll();
            if (record == null) {
                throw new java.util.NoSuchElementException("no more records");
            }
            return record;
        }
    }

    /** Downloads pages while the queue is empty and a next query exists. */
    private void fillQueue() {
        while (recordQueue.isEmpty() && !query.isEmpty()) {
            try {
                log.info("get Query: " + query);
                query = downloadPage(query);
                log.debug("next queryURL from downloadPage(): " + query);
            } catch (CollectorServiceException e) {
                log.debug("CollectorPlugin.next()-Exception: " + e);
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Downloads a page, queues its records and computes the query for the next page.
     *
     * @return the next query URL, or an empty string when the harvesting is complete
     * @throws IllegalStateException on any failure; the original exception is kept as cause
     */
    private String downloadPage(String query) throws CollectorServiceException {
        final String xmlHeader = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
        final String emptyXml = xmlHeader + "<" + wrapName + "></" + wrapName + ">";
        // keywords are compared locale-independently (e.g. 'I' must lower-case to 'i' everywhere)
        final String mode = resumptionType.toLowerCase(java.util.Locale.ROOT);

        String resultXml = xmlHeader;
        String nextQuery = "";
        Node resultNode = null;
        NodeList nodeList = null;
        boolean emptyResult = false;

        try {
            final URL qUrl = new URL(query);

            // the connection is released as soon as the page has been parsed
            try (InputStream urlStream = qUrl.openStream()) {
                InputStream resultStream = urlStream;
                if ("json".equalsIgnoreCase(resultFormatValue)) {
                    String resultJson = IOUtils.toString(urlStream, UTF_8);
                    resultJson = syntaxConvertJsonKeyNamens(resultJson);
                    org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
                    resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
                    log.trace("before inputStream: " + resultXml);
                    resultXml = XmlCleaner.cleanAllEntities(resultXml);
                    log.trace("after cleaning: " + resultXml);
                    resultStream = IOUtils.toInputStream(resultXml, UTF_8);
                }

                emptyResult = emptyXml.equalsIgnoreCase(resultXml);
                if (!emptyResult) {
                    resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
                    nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
                    log.debug("nodeList.length: " + nodeList.getLength());
                    for (int i = 0; i < nodeList.getLength(); i++) {
                        StringWriter sw = new StringWriter();
                        transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
                        recordQueue.add(sw.toString());
                    }
                } else {
                    log.info("resultXml is equal with emptyXml");
                }
            }

            resumptionInt += resultSizeValue;

            switch (mode) {
            case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
                resumptionStr = xprResumptionPath.evaluate(resultNode);
                break;

            case "count": // begin at one step for all records, iterate over items
                resumptionStr = Integer.toString(resumptionInt);
                break;

            case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
                if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
                logOldResumptionValue(qUrl);

                if (emptyResult || ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
                    // last page reached: the total is what has been found so far
                    if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
                    resultTotal = discoverResultSize;
                } else {
                    resumptionStr = Integer.toString(resumptionInt);
                    resultTotal = resumptionInt + 1;
                    if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
                }
                log.info("discoverResultSize: " + discoverResultSize);
                break;

            case "pagination":
            case "page": // pagination, iterate over pages
                pagination += 1;
                if (nodeList != null) {
                    discoverResultSize += nodeList.getLength();
                } else {
                    resultTotal = discoverResultSize;
                    pagination = discoverResultSize;
                }
                resumptionInt = pagination;
                resumptionStr = Integer.toString(resumptionInt);
                break;

            default: // otherwise: abort
                // resultTotal = resumptionInt;
                break;
            }

            if (resultTotal == -1) {
                if (xprResultTotalPath == null) {
                    throw new IllegalStateException("Param 'resultTotalXpath' is required by resumptionType '" + resumptionType + "'");
                }
                resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
                // NOTE(review): the correction is applied to 'page' only, not to its alias 'pagination' -- confirm whether intended
                if ("page".equals(mode)) { resultTotal += 1; } // to correct the upper bound
                log.info("resultTotal was -1 is now: " + resultTotal);
            }
            log.info("resultTotal: " + resultTotal);
            log.info("resInt: " + resumptionInt);
            if (resumptionInt < resultTotal) {
                nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
            } else {
                nextQuery = "";
            }
            log.debug("nextQueryUrl: " + nextQuery);
            return nextQuery;

        } catch (Exception e) {
            log.error("collection failed: " + e.getMessage(), e);
            throw new IllegalStateException("collection failed: " + e.getMessage(), e);
        }
    }

    /** Debug aid for 'discover': logs the resumption value carried by the URL just downloaded, if any. */
    private void logOldResumptionValue(final URL qUrl) {
        final String urlQuery = qUrl.getQuery();
        if (urlQuery == null) {
            return;
        }
        final String prefix = resumptionParam + "=";
        for (String argument : urlQuery.split("&")) {
            if (argument.startsWith(prefix)) {
                log.debug("discover OldResumptionSize from Url: " + argument.substring(prefix.length()));
            }
        }
    }

    /**
     * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
     * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
     * and work-around for the JSON to XML converting of org.json.XML-package.
     * <p>
     * Every rule is applied until the text no longer changes, whether or not the JSON contains line breaks.
     * </p>
     *
     * known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"],
     *
     * @param jsonInput
     * @return convertedJsonKeynameOutput
     */
    private String syntaxConvertJsonKeyNamens(String jsonInput) {
        log.trace("before convertJsonKeyNames: " + jsonInput);

        // pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
        // replace ' 's in JSON Namens with '_'
        jsonInput = replaceUntilStable(jsonInput, "\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
        // replace forward-slash (sign '/' ) in JSON Names with '_'
        jsonInput = replaceUntilStable(jsonInput, "\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
        // replace '(' in JSON Names with ''
        jsonInput = replaceUntilStable(jsonInput, "\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
        // replace ')' in JSON Names with ''
        jsonInput = replaceUntilStable(jsonInput, "\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
        // replace startNumbers in JSON Keynames with 'n_'
        jsonInput = replaceUntilStable(jsonInput, "\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
        // replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
        jsonInput = replaceUntilStable(jsonInput, "\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
        // ',' in JSON Keynames is deliberately left alone (a rule replacing it with '.' was disabled)
        // replace '=' in JSON Keynames with '-'
        jsonInput = replaceUntilStable(jsonInput, "\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");

        log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
        return jsonInput;
    }

    /**
     * Applies {@code replaceAll} repeatedly until a pass leaves the text unchanged. One pass is
     * not enough because a pass rewrites at most one occurrence per key.
     */
    private static String replaceUntilStable(final String input, final String regex, final String replacement) {
        String current = input;
        String previous;
        do {
            previous = current;
            current = current.replaceAll(regex, replacement);
        } while (!current.equals(previous));
        return current;
    }

    /**
     * Removes the commas found inside XML tags.
     *
     * https://www.w3.org/TR/REC-xml/#charencoding shows character encoding in entities
     *
     * @param bufferStr - XML string
     * @return the XML string without commas inside tags
     */
    private static String cleanUnwantedJsonCharsInXmlTagnames(String bufferStr) {
        // a rule dropping the character references [#x10-#x1f] was disabled and is not applied
        return replaceUntilStable(bufferStr, "<([^<>]*),([^<>]*)>", "<$1$2>");
    }
}

View File

@ -0,0 +1,685 @@
package eu.dnetlib.data.collector.plugins.schemaorg;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.StringWriter;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
public class DatasetDocument {
// Properties of a dataset description, one list per property.
// None of the fields has an initializer here, so a getter returns null until the
// matching setter has been called (unless code outside this view assigns the field).
// NOTE(review): the property names look like schema.org / DataCite terms -- confirm against the serialization code.
private List<Identifier> identifiers;
private List<Creator> creators;
private List<String> titles;
private List<String> alternativeTitles;
private List<String> publishers;
private List<LocalDate> publicationDates;
private List<String> subjects;
private List<Contributor> contributors;
private List<LocalDate> createdDates;
private List<LocalDate> updatedDates;
private List<String> languages;
private List<ResourceType> resourceTypes;
// singular name, but it holds a list like the other properties
private List<AlternateIdentifier> alternateIdentifier;
private List<Citation> citations;
private List<String> sizes;
// singular names, but both hold lists
private List<String> format;
private List<String> version;
private List<License> licenses;
private List<String> descriptions;
private List<String> disambiguatingDescriptions;
private List<SpatialCoverage> geoLocations;
// Plain accessors: every getter returns the stored list itself (no copy is made) and
// every setter stores the given reference as is.
public List<Identifier> getIdentifiers() {
return identifiers;
}
public void setIdentifiers(List<Identifier> identifiers) {
this.identifiers = identifiers;
}
public List<Creator> getCreators() {
return creators;
}
public void setCreators(List<Creator> creators) {
this.creators = creators;
}
public List<String> getTitles() {
return titles;
}
public void setTitles(List<String> titles) {
this.titles = titles;
}
public List<String> getAlternativeTitles() {
return alternativeTitles;
}
public void setAlternativeTitles(List<String> alternativeTitles) {
this.alternativeTitles = alternativeTitles;
}
public List<String> getPublishers() {
return publishers;
}
public void setPublishers(List<String> publishers) {
this.publishers = publishers;
}
public List<LocalDate> getPublicationDates() {
return publicationDates;
}
public void setPublicationDates(List<LocalDate> publicationDates) {
this.publicationDates = publicationDates;
}
public List<String> getSubjects() {
return subjects;
}
public void setSubjects(List<String> subjects) {
this.subjects = subjects;
}
public List<Contributor> getContributors() {
return contributors;
}
public void setContributors(List<Contributor> contributors) {
this.contributors = contributors;
}
public List<LocalDate> getCreatedDates() {
return createdDates;
}
public void setCreatedDates(List<LocalDate> createdDates) {
this.createdDates = createdDates;
}
public List<LocalDate> getUpdatedDates() {
return updatedDates;
}
public void setUpdatedDates(List<LocalDate> updatedDates) {
this.updatedDates = updatedDates;
}
public List<String> getLanguages() {
return languages;
}
public void setLanguages(List<String> languages) {
this.languages = languages;
}
public List<ResourceType> getResourceTypes() {
return resourceTypes;
}
public void setResourceTypes(List<ResourceType> resourceTypes) {
this.resourceTypes = resourceTypes;
}
// accessor pair of the list field 'alternateIdentifier' (singular name)
public List<AlternateIdentifier> getAlternateIdentifier() {
return alternateIdentifier;
}
public void setAlternateIdentifier(List<AlternateIdentifier> alternateIdentifier) {
this.alternateIdentifier = alternateIdentifier;
}
public List<Citation> getCitations() {
return citations;
}
public void setCitations(List<Citation> citations) {
this.citations = citations;
}
public List<String> getSizes() {
return sizes;
}
public void setSizes(List<String> sizes) {
this.sizes = sizes;
}
// accessor pair of the list field 'format' (singular name)
public List<String> getFormat() {
return format;
}
public void setFormat(List<String> format) {
this.format = format;
}
// accessor pair of the list field 'version' (singular name)
public List<String> getVersion() {
return version;
}
public void setVersion(List<String> version) {
this.version = version;
}
public List<License> getLicenses() {
return licenses;
}
public void setLicenses(List<License> licenses) {
this.licenses = licenses;
}
public List<String> getDescriptions() {
return descriptions;
}
public void setDescriptions(List<String> descriptions) {
this.descriptions = descriptions;
}
public List<String> getDisambiguatingDescriptions() {
return disambiguatingDescriptions;
}
public void setDisambiguatingDescriptions(List<String> disambiguatingDescriptions) {
this.disambiguatingDescriptions = disambiguatingDescriptions;
}
public List<SpatialCoverage> getGeoLocations() {
return geoLocations;
}
public void setGeoLocations(List<SpatialCoverage> geoLocations) {
this.geoLocations = geoLocations;
}
private static String emptyXml;
private static Object lockEmptyXml = new Object();
public static String emptyXml() {
if(DatasetDocument.emptyXml!=null) return DatasetDocument.emptyXml;
String xml = null;
try {
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
Document doc = docBuilder.newDocument();
Element root = doc.createElement("dataset");
doc.appendChild(root);
TransformerFactory tf = TransformerFactory.newInstance();
Transformer transformer = tf.newTransformer();
StringWriter writer = new StringWriter();
transformer.transform(new DOMSource(doc), new StreamResult(writer));
xml = writer.getBuffer().toString();
}catch(Exception ex){
xml = "<dataset/>";
}
synchronized (DatasetDocument.lockEmptyXml) {
if (DatasetDocument.emptyXml == null) DatasetDocument.emptyXml = xml;
}
return DatasetDocument.emptyXml;
}
public String toXml() throws Exception {
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
Document doc = docBuilder.newDocument();
Element root = doc.createElement("dataset");
doc.appendChild(root);
if(this.identifiers!=null){
for(Identifier item : this.identifiers){
item.toXml(root);
}
}
if(this.creators!=null){
Element creators = doc.createElement("creators");
root.appendChild(creators);
for(Creator item : this.creators){
item.toXml(creators);
}
}
if(this.titles!=null || this.alternativeTitles!=null){
Element titles = doc.createElement("titles");
root.appendChild(titles);
if(this.titles!=null) {
for (String item : this.titles) {
Element title = doc.createElement("title");
titles.appendChild(title);
title.appendChild(doc.createTextNode(item));
}
}
if(this.alternativeTitles!=null) {
for (String item : this.alternativeTitles) {
Element title = doc.createElement("title");
titles.appendChild(title);
title.setAttribute("titleType", "AlternativeTitle");
title.appendChild(doc.createTextNode(item));
}
}
}
if(this.publishers!=null){
for(String item : this.publishers){
Element publisher = doc.createElement("publisher");
root.appendChild(publisher);
publisher.appendChild(doc.createTextNode(item));
}
}
if(this.publicationDates!=null){
for(LocalDate item : this.publicationDates){
Element publicationYear = doc.createElement("publicationYear");
root.appendChild(publicationYear);
publicationYear.appendChild(doc.createTextNode(Integer.toString(item.getYear())));
}
}
if(this.subjects!=null){
Element subjects = doc.createElement("subjects");
root.appendChild(subjects);
for(String item : this.subjects){
Element subject = doc.createElement("subject");
subjects.appendChild(subject);
subject.appendChild(doc.createTextNode(item));
}
}
if(this.contributors!=null){
for(Contributor item : this.contributors){
item.toXml(root);
}
}
if(this.createdDates!=null || this.updatedDates!=null){
Element dates = doc.createElement("dates");
root.appendChild(dates);
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("YYYY-MM-DD");
if(createdDates!=null) {
for (LocalDate item : this.createdDates) {
Element date = doc.createElement("date");
root.appendChild(date);
date.setAttribute("dateType", "Created");
date.appendChild(doc.createTextNode(item.format(formatter)));
}
}
if(updatedDates!=null) {
for (LocalDate item : this.updatedDates) {
Element date = doc.createElement("date");
root.appendChild(date);
date.setAttribute("dateType", "Updated");
date.appendChild(doc.createTextNode(item.format(formatter)));
}
}
}
if(this.languages!=null){
for(String item : this.languages){
Element language = doc.createElement("language");
root.appendChild(language);
language.appendChild(doc.createTextNode(item));
}
}
if(this.resourceTypes!=null){
for(ResourceType item : this.resourceTypes){
item.toXml(root);
}
}
if(this.alternateIdentifier!=null){
Element alternateIdentifiers = doc.createElement("alternateIdentifiers");
root.appendChild(alternateIdentifiers);
for(AlternateIdentifier item : this.alternateIdentifier){
item.toXml(alternateIdentifiers);
}
}
if(this.citations!=null){
for(Citation item : this.citations){
item.toXml(root);
}
}
if(this.sizes!=null){
Element sizes = doc.createElement("sizes");
root.appendChild(sizes);
for(String item : this.sizes){
Element size = doc.createElement("size");
sizes.appendChild(size);
size.appendChild(doc.createTextNode(item));
}
}
if(this.format!=null){
Element formats = doc.createElement("formats");
root.appendChild(formats);
for(String item : this.format){
Element format = doc.createElement("format");
formats.appendChild(format);
format.appendChild(doc.createTextNode(item));
}
}
if(this.version!=null){
for(String item : this.version){
Element version = doc.createElement("version");
root.appendChild(version);
version.appendChild(doc.createTextNode(item));
}
}
if(this.licenses!=null){
Element rightsList = doc.createElement("rightsList");
root.appendChild(rightsList);
for(License item : this.licenses){
item.toXml(rightsList);
}
}
if(this.descriptions!=null || this.disambiguatingDescriptions!=null){
Element descriptions = doc.createElement("descriptions");
root.appendChild(descriptions);
if(this.descriptions!=null) {
for (String item : this.descriptions) {
Element description = doc.createElement("description");
descriptions.appendChild(description);
description.setAttribute("descriptionType", "Abstract");
description.appendChild(doc.createTextNode(item));
}
}
if(this.disambiguatingDescriptions!=null) {
for (String item : this.disambiguatingDescriptions) {
Element description = doc.createElement("description");
descriptions.appendChild(description);
description.setAttribute("descriptionType", "Other");
description.appendChild(doc.createTextNode(item));
}
}
}
if(this.geoLocations!=null){
Element geoLocations = doc.createElement("geoLocations");
root.appendChild(geoLocations);
for(SpatialCoverage item : this.geoLocations){
item.toXml(geoLocations);
}
}
TransformerFactory tf = TransformerFactory.newInstance();
Transformer transformer = tf.newTransformer();
StringWriter writer = new StringWriter();
transformer.transform(new DOMSource(doc), new StreamResult(writer));
String xml = writer.getBuffer().toString();
return xml;
}
public static class SpatialCoverage{
public static class Point{
public String latitude;
public String longitude;
public Point() {}
public Point(String latitude, String longitude){
this.latitude = latitude;
this.longitude = longitude;
}
}
public String name;
public List<Point> points;
public List<String> boxes;
public SpatialCoverage() {}
public SpatialCoverage(String name, List<Point> points, List<String> boxes ) {
this.name = name;
this.points = points;
this.boxes = boxes;
}
public void toXml(Element parent){
Element node = parent.getOwnerDocument().createElement("geoLocation");
parent.appendChild(node);
if(this.points!=null) {
for(Point point : this.points) {
if(point.latitude == null || point.longitude == null) continue;
Element geoLocationPoint = parent.getOwnerDocument().createElement("geoLocationPoint");
geoLocationPoint.appendChild(parent.getOwnerDocument().createTextNode(String.format("%s %s", point.latitude, point.longitude)));
node.appendChild(geoLocationPoint);
}
}
if(this.boxes!=null) {
for(String box : this.boxes) {
if(box == null) continue;
Element geoLocationBox = parent.getOwnerDocument().createElement("geoLocationBox");
geoLocationBox.appendChild(parent.getOwnerDocument().createTextNode(box));
node.appendChild(geoLocationBox);
}
}
if(this.name!=null) {
Element geoLocationPlace = parent.getOwnerDocument().createElement("geoLocationPlace");
geoLocationPlace.appendChild(parent.getOwnerDocument().createTextNode(this.name));
node.appendChild(geoLocationPlace);
}
}
}
public static class License{
public String name;
public String url;
public License() {}
public License(String name, String url) {
this.name = name;
this.url = url;
}
public void toXml(Element parent){
Element node = parent.getOwnerDocument().createElement("rights");
parent.appendChild(node);
if(this.url!=null) {
node.setAttribute("rightsURI", this.url);
}
if(this.name!=null) {
node.appendChild(parent.getOwnerDocument().createTextNode(this.name));
}
}
}
public static class Citation{
public enum CitationIdentifierType{
ARK, arXiv, bibcode, DOI, EAN13, EISSN, Handle, ISBN, ISSN, ISTC, LISSN, LSID, PMID,
PURL, UPC, URL, URN
}
public CitationIdentifierType type;
public String value;
public Citation() {}
public Citation(String value, CitationIdentifierType type) {
this.value = value;
this.type = type;
}
public void toXml(Element parent){
Element node = parent.getOwnerDocument().createElement("relatedIdentifier");
parent.appendChild(node);
node.setAttribute("relatedIdentifierType", this.type.toString());
node.setAttribute("relationType", "Cites");
node.appendChild(parent.getOwnerDocument().createTextNode(this.value));
}
}
public static class Contributor{
public enum ContributorType{
ContactPerson, DataCollector, DataCurator, DataManager, Distributor, Editor, Funder, HostingInstitution,
Producer, ProjectLeader, ProjectManager, ProjectMember, RegistrationAgency, RegistrationAuthority,
RelatedPerson, Researcher, ResearchGroup, RightsHolder, Sponsor, Supervisor, WorkPackageLeader, Other
}
public String name;
public List<String> affiliations;
public ContributorType type;
public Contributor() {
}
public Contributor(String name) {
this.name = name;
}
public Contributor(String name, List<String> affiliations) {
this.name = name;
this.affiliations = affiliations;
}
public Contributor(String name, List<String> affiliations, ContributorType type) {
this.name = name;
this.affiliations = affiliations;
this.type = type;
}
public void toXml(Element parent){
Element node = parent.getOwnerDocument().createElement("contributor");
parent.appendChild(node);
node.setAttribute("contributorType", this.type.toString());
if(this.name!=null) {
Element contributorName = parent.getOwnerDocument().createElement("contributorName");
node.appendChild(contributorName);
contributorName.appendChild(parent.getOwnerDocument().createTextNode(this.name));
}
if(this.affiliations!=null) {
for(String item : this.affiliations) {
Element affiliation = parent.getOwnerDocument().createElement("affiliation");
node.appendChild(affiliation);
affiliation.appendChild(parent.getOwnerDocument().createTextNode(item));
}
}
}
}
public static class AlternateIdentifier{
public String identifier;
public String type;
public AlternateIdentifier() {}
public AlternateIdentifier(String identifier, String type) {
this.identifier = identifier;
this.type = type;
}
public void toXml(Element parent){
Element node = parent.getOwnerDocument().createElement("alternateIdentifier");
parent.appendChild(node);
if(this.type!=null) {
node.setAttribute("alternateIdentifierType", this.type);
}
if(this.identifier!=null) {
node.appendChild(parent.getOwnerDocument().createTextNode(this.identifier));
}
}
}
public static class ResourceType{
public enum ResourceTypeGeneralType {
Audiovisual, Collection, Dataset, Event, Image, InteractiveResource, Model, PhysicalObject, Service,
Software, Sound, Text, Workflow, Other
}
public ResourceTypeGeneralType type;
public ResourceType() {}
public ResourceType(ResourceTypeGeneralType type) {
this.type = type;
}
public void toXml(Element parent){
Element node = parent.getOwnerDocument().createElement("resourceType");
parent.appendChild(node);
if(this.type!=null) {
node.setAttribute("resourceTypeGeneral", this.type.toString());
}
}
}
public static class Creator {
public String name;
public List<String> affiliations;
public Creator() {
}
public Creator(String name) {
this.name = name;
}
public Creator(String name, List<String> affiliations) {
this.name = name;
this.affiliations = affiliations;
}
public void toXml(Element parent){
Element node = parent.getOwnerDocument().createElement("creator");
parent.appendChild(node);
if(this.name!=null) {
Element creatorName = parent.getOwnerDocument().createElement("creatorName");
node.appendChild(creatorName);
creatorName.appendChild(parent.getOwnerDocument().createTextNode(this.name));
}
if(this.affiliations!=null) {
for(String item : this.affiliations) {
Element affiliation = parent.getOwnerDocument().createElement("affiliation");
node.appendChild(affiliation);
affiliation.appendChild(parent.getOwnerDocument().createTextNode(item));
}
}
}
}
public static class Identifier {
public enum IdentifierType {
ARK, DOI, Handle, PURL, URN, URL
}
public String value;
public IdentifierType type;
public Identifier() {
}
public Identifier(IdentifierType type, String value) {
this.type = type;
this.value = value;
}
public void toXml(Element parent){
Element node = parent.getOwnerDocument().createElement("identifier");
parent.appendChild(node);
node.setAttribute("identifierType", this.type.toString());
if(this.value!=null) {
node.appendChild(parent.getOwnerDocument().createTextNode(this.value));
}
}
}
}

View File

@ -0,0 +1,514 @@
package eu.dnetlib.data.collector.plugins.schemaorg;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONObject;
import java.net.URL;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.*;
public class DatasetMappingIterator implements Iterator<String> {
// Logger bound to this class, so that messages are attributed to DatasetMappingIterator.
private static final Log log = LogFactory.getLog(DatasetMappingIterator.class);
/**
 * Mapping configuration: one optional settings object per group of fields.
 * All groups default to {@code null} until set.
 */
public static class Options {

    /** Settings used when mapping identifiers. */
    public static class IdentifierOptions {
        /** Source type labels mapped to the ARK identifier type. */
        public List<String> mappingARK;
        /** Source type labels mapped to the DOI identifier type. */
        public List<String> mappingDOI;
        /** Source type labels mapped to the Handle identifier type. */
        public List<String> mappingHandle;
        /** Source type labels mapped to the PURL identifier type. */
        public List<String> mappingPURL;
        /** Source type labels mapped to the URN identifier type. */
        public List<String> mappingURN;
        /** Source type labels mapped to the URL identifier type. */
        public List<String> mappingURL;
        /** Type assigned to identifiers that declare no type. */
        public DatasetDocument.Identifier.IdentifierType fallbackType;
        /** Whether the document URL may stand in when no identifier was mapped. */
        public Boolean fallbackURL;
    }

    /** Settings used when mapping contributors. */
    public static class ContributorOptions {
        public DatasetDocument.Contributor.ContributorType fallbackType;
    }

    /** Date pattern settings for publication dates. */
    public static class PublicationDateOptions {
        public String format;
    }

    /** Date pattern settings for creation dates. */
    public static class CreatedDateOptions {
        public String format;
    }

    /** Date pattern settings for modification dates. */
    public static class UpdatedDateOptions {
        public String format;
    }

    private IdentifierOptions identifierOptions;
    private PublicationDateOptions publicationDateOptions;
    private ContributorOptions contributorOptions;
    private CreatedDateOptions createdDateOptions;
    private UpdatedDateOptions updatedDateOptions;

    public IdentifierOptions getIdentifierOptions() {
        return this.identifierOptions;
    }

    public void setIdentifierOptions(IdentifierOptions identifierOptions) {
        this.identifierOptions = identifierOptions;
    }

    public PublicationDateOptions getPublicationDateOptions() {
        return this.publicationDateOptions;
    }

    public void setPublicationDateOptions(PublicationDateOptions publicationDateOptions) {
        this.publicationDateOptions = publicationDateOptions;
    }

    public ContributorOptions getContributorOptions() {
        return this.contributorOptions;
    }

    public void setContributorOptions(ContributorOptions contributorOptions) {
        this.contributorOptions = contributorOptions;
    }

    public CreatedDateOptions getCreatedDateOptions() {
        return this.createdDateOptions;
    }

    public void setCreatedDateOptions(CreatedDateOptions createdDateOptions) {
        this.createdDateOptions = createdDateOptions;
    }

    public UpdatedDateOptions getUpdatedDateOptions() {
        return this.updatedDateOptions;
    }

    public void setUpdatedDateOptions(UpdatedDateOptions updatedDateOptions) {
        this.updatedDateOptions = updatedDateOptions;
    }
}
private Options options;
private EndpointAccessIterator endpointAccessIterator;
/**
 * Creates an iterator that maps every document supplied by {@code endpointAccessIterator}
 * to XML, using {@code options} to drive the mapping.
 *
 * @param options mapping configuration, stored as given
 * @param endpointAccessIterator source of the JSON documents, stored as given
 */
public DatasetMappingIterator(Options options, EndpointAccessIterator endpointAccessIterator) {
this.options = options;
this.endpointAccessIterator = endpointAccessIterator;
}
/** Delegates to the wrapped endpoint iterator. */
@Override
public boolean hasNext() {
    return endpointAccessIterator.hasNext();
}
/**
 * Takes the next JSON document from the wrapped iterator and returns its XML mapping.
 * An empty dataset document is returned when there is no document, when the mapping
 * yields nothing, or when {@code Utils.validateXml} rejects the result.
 */
@Override
public String next() {
    final JSONObject document = this.endpointAccessIterator.next();
    String xml;
    if (document != null) {
        log.debug("building document");
        String built = this.buildDataset(document);
        if (!Utils.validateXml(built)) {
            log.debug("xml not valid. setting to empty");
            built = null;
        }
        if (built == null) {
            log.debug("could not build xml. returning empty");
            built = DatasetDocument.emptyXml();
        }
        xml = built;
    } else {
        log.debug("no document provided to process. returning empty");
        xml = DatasetDocument.emptyXml();
    }
    // last line of defence, in case even the cached empty document is missing
    if (xml == null) {
        log.debug("could not build xml. returning empty");
        xml = "<dataset/>";
    }
    log.debug("xml document for dataset is: "+xml);
    return xml;
}
/**
 * Maps a JSON document to a {@link DatasetDocument} and serializes it.
 * <p>
 * When no identifier could be extracted and the identifier options enable the URL
 * fallback, the document URLs are used as identifiers. Missing identifier options or an
 * unset {@code fallbackURL} flag count as "fallback disabled" rather than failing the
 * whole document.
 *
 * @param document the JSON document to map
 * @return the XML, or {@code null} when any step throws (the exception is logged)
 */
private String buildDataset(JSONObject document){
    String xml = null;
    try{
        DatasetDocument dataset = new DatasetDocument();
        dataset.setIdentifiers(this.extractIdentifier(document));
        dataset.setCreators(this.extractCreator(document));
        dataset.setTitles(this.extractTitles(document));
        dataset.setAlternativeTitles(this.extractAlternateTitles(document));
        dataset.setPublishers(this.extractPublisher(document));
        dataset.setPublicationDates(this.extractPublicationDate(document));
        dataset.setSubjects(this.extractSubjects(document));
        dataset.setContributors(this.extractContributors(document));
        dataset.setCreatedDates(this.extractCreatedDate(document));
        dataset.setUpdatedDates(this.extractUpdatedDate(document));
        dataset.setLanguages(this.extractLanguages(document));
        dataset.setResourceTypes(this.extractResourceTypes(document));
        dataset.setAlternateIdentifier(this.extractAlternateIdentifiers(document));
        dataset.setCitations(this.extractCitations(document));
        dataset.setSizes(this.extractSize(document));
        dataset.setFormat(this.extractEncodingFormat(document));
        dataset.setVersion(this.extractVersion(document));
        dataset.setLicenses(this.extractLicense(document));
        dataset.setDescriptions(this.extractDescription(document));
        dataset.setDisambiguatingDescriptions(this.extractDisambiguatingDescription(document));
        dataset.setGeoLocations(this.extractSpatialCoverage(document));

        boolean hasIdentifiers = dataset.getIdentifiers() != null && dataset.getIdentifiers().size() > 0;
        log.debug("document contains native identifier: : " + hasIdentifiers);
        if (!hasIdentifiers) {
            Options.IdentifierOptions identifierOptions = this.options.getIdentifierOptions();
            // fallbackURL is a boxed Boolean: compare explicitly instead of unboxing a possible null
            boolean fallbackToUrl = identifierOptions != null && Boolean.TRUE.equals(identifierOptions.fallbackURL);
            if (fallbackToUrl) {
                log.debug("falling back to url identifier");
                dataset.setIdentifiers(this.extractIdentifierFallbackURL(document));
                log.debug("document contains overridden identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
            }
        }
        xml = dataset.toXml();
    }
    catch(Exception ex){
        log.error("problem constructing dataset xml. returning empty", ex);
        xml = null;
    }
    return xml;
}
/**
 * Builds URL-typed identifiers from the document's {@code url} values,
 * ignoring null or blank entries and trimming the rest.
 */
private List<DatasetDocument.Identifier> extractIdentifierFallbackURL(JSONObject document){
    final List<String> candidates = JSONLDUtils.extractString(document, "url");
    final ArrayList<DatasetDocument.Identifier> result = new ArrayList<>();
    for (String candidate : candidates) {
        if (candidate != null && !candidate.trim().isEmpty()) {
            result.add(new DatasetDocument.Identifier(DatasetDocument.Identifier.IdentifierType.URL, candidate.trim()));
        }
    }
    return result;
}
/**
 * Converts the places found under {@code spatialCoverage} into spatial coverage entries.
 * A place is dropped only when it has no usable name, no coordinates and no shapes.
 */
private List<DatasetDocument.SpatialCoverage> extractSpatialCoverage(JSONObject document){
    final List<JSONLDUtils.PlaceInfo> places = JSONLDUtils.extractPlaces(document, "spatialCoverage");
    final ArrayList<DatasetDocument.SpatialCoverage> result = new ArrayList<>();
    for (JSONLDUtils.PlaceInfo place : places) {
        final boolean named = place.name != null && place.name.trim().length() != 0;
        final boolean located = place.geoCoordinates != null && place.geoCoordinates.size() != 0;
        final boolean shaped = place.geoShapes != null && place.geoShapes.size() != 0;
        if (!named && !located && !shaped) {
            continue;
        }

        final List<DatasetDocument.SpatialCoverage.Point> points = new ArrayList<>();
        if (place.geoCoordinates != null) {
            for (JSONLDUtils.GeoCoordinatesInfo coordinates : place.geoCoordinates) {
                points.add(new DatasetDocument.SpatialCoverage.Point(coordinates.latitude, coordinates.longitude));
            }
        }

        final List<String> boxes = new ArrayList<>();
        if (place.geoShapes != null) {
            for (JSONLDUtils.GeoShapeInfo shape : place.geoShapes) {
                boxes.add(shape.box);
            }
        }

        result.add(new DatasetDocument.SpatialCoverage(place.name, points, boxes));
    }
    return result;
}
/** Returns the non-blank {@code description} values of the document, untrimmed. */
private List<String> extractDescription(JSONObject document){
    final ArrayList<String> kept = new ArrayList<>();
    for (String text : JSONLDUtils.extractString(document, "description")) {
        if (text != null && !text.trim().isEmpty()) {
            kept.add(text);
        }
    }
    return kept;
}
/** Returns the non-blank {@code disambiguatingDescription} values of the document, untrimmed. */
private List<String> extractDisambiguatingDescription(JSONObject document){
    final ArrayList<String> kept = new ArrayList<>();
    for (String text : JSONLDUtils.extractString(document, "disambiguatingDescription")) {
        if (text != null && !text.trim().isEmpty()) {
            kept.add(text);
        }
    }
    return kept;
}
/** Maps the document's {@code license} entries to licenses; entries without a URL are dropped. */
private List<DatasetDocument.License> extractLicense(JSONObject document){
    final List<JSONLDUtils.LicenseInfo> found = JSONLDUtils.extractLicenses(document, "license");
    final ArrayList<DatasetDocument.License> result = new ArrayList<>();
    for (JSONLDUtils.LicenseInfo license : found) {
        final boolean hasUrl = license.url != null && !license.url.trim().isEmpty();
        if (hasUrl) {
            result.add(new DatasetDocument.License(license.name, license.url));
        }
    }
    return result;
}
/** Returns the non-blank {@code version} values of the document, untrimmed. */
private List<String> extractVersion(JSONObject document){
    final ArrayList<String> kept = new ArrayList<>();
    for (String candidate : JSONLDUtils.extractString(document, "version")) {
        if (candidate != null && !candidate.trim().isEmpty()) {
            kept.add(candidate);
        }
    }
    return kept;
}
/** Returns the distinct non-blank sizes reported for the document's {@code distribution}. */
private List<String> extractSize(JSONObject document) {
    final List<String> reported = JSONLDUtils.extractSize(document, "distribution");
    // a set removes duplicates; the resulting order is whatever the hash set yields
    final HashSet<String> distinct = new HashSet<>();
    for (String size : reported) {
        if (size != null && !size.trim().isEmpty()) {
            distinct.add(size);
        }
    }
    return new ArrayList<>(distinct);
}
/** Returns the distinct non-blank encoding formats reported for the document's {@code distribution}. */
private List<String> extractEncodingFormat(JSONObject document){
    final List<String> reported = JSONLDUtils.extractEncodingFormat(document, "distribution");
    // a set removes duplicates; the resulting order is whatever the hash set yields
    final HashSet<String> distinct = new HashSet<>();
    for (String encoding : reported) {
        if (encoding != null && !encoding.trim().isEmpty()) {
            distinct.add(encoding);
        }
    }
    return new ArrayList<>(distinct);
}
//TODO: Handle different citation types. Currently only urls
/**
 * Maps the document's {@code citation} entries to URL-typed citations.
 * Entries whose url is blank or cannot be parsed by {@link URL} are dropped.
 */
private List<DatasetDocument.Citation> extractCitations(JSONObject document){
    final List<JSONLDUtils.CitationInfo> found = JSONLDUtils.extractCitations(document, "citation");
    final ArrayList<DatasetDocument.Citation> result = new ArrayList<>();
    for (JSONLDUtils.CitationInfo citation : found) {
        if (citation.url == null || citation.url.trim().isEmpty()) {
            continue;
        }
        boolean parsable;
        try {
            // constructed only to check that the value parses as a URL
            new URL(citation.url);
            parsable = true;
        } catch (Exception ex) {
            parsable = false;
        }
        if (parsable) {
            result.add(new DatasetDocument.Citation(citation.url, DatasetDocument.Citation.CitationIdentifierType.URL));
        }
    }
    return result;
}
/**
 * Collects alternate identifiers: first the {@code issn} values (type "ISSN"),
 * then the {@code url} values (type "URL"). Blank values are dropped, the rest trimmed.
 */
private List<DatasetDocument.AlternateIdentifier> extractAlternateIdentifiers(JSONObject document){
    final List<String> issnValues = JSONLDUtils.extractString(document, "issn");
    final List<String> urlValues = JSONLDUtils.extractString(document, "url");
    final ArrayList<DatasetDocument.AlternateIdentifier> result = new ArrayList<>();
    for (String issn : issnValues) {
        if (issn != null && !issn.trim().isEmpty()) {
            result.add(new DatasetDocument.AlternateIdentifier(issn.trim(), "ISSN"));
        }
    }
    for (String url : urlValues) {
        if (url != null && !url.trim().isEmpty()) {
            result.add(new DatasetDocument.AlternateIdentifier(url.trim(), "URL"));
        }
    }
    return result;
}
/** Always yields a single resource type, {@code Dataset}; the document itself is not inspected. */
private List<DatasetDocument.ResourceType> extractResourceTypes(JSONObject document){
    final DatasetDocument.ResourceType datasetType =
            new DatasetDocument.ResourceType(DatasetDocument.ResourceType.ResourceTypeGeneralType.Dataset);
    final List<DatasetDocument.ResourceType> types = new ArrayList<>();
    types.add(datasetType);
    return types;
}
/** Returns the non-blank languages found under {@code inLanguage}, untrimmed. */
private List<String> extractLanguages(JSONObject document){
    final ArrayList<String> kept = new ArrayList<>();
    for (String language : JSONLDUtils.extractLanguage(document, "inLanguage")) {
        if (language != null && !language.trim().isEmpty()) {
            kept.add(language);
        }
    }
    return kept;
}
/**
 * Parses the document's {@code dateModified} values with the pattern configured in the
 * updated-date options.
 * <p>
 * Returns an empty list when no updated-date pattern is configured. Blank values and
 * values that do not match the pattern are skipped.
 *
 * @param document the JSON document to read
 * @return the parsed dates, possibly empty, never {@code null}
 */
private List<LocalDate> extractUpdatedDate(JSONObject document){
    List<LocalDate> updatedDates = new ArrayList<>();
    Options.UpdatedDateOptions updatedDateOptions = this.options.getUpdatedDateOptions();
    if(updatedDateOptions == null || updatedDateOptions.format == null || updatedDateOptions.format.length() == 0) return updatedDates;
    // use the updated-date pattern that was just validated, not the publication-date one
    DateTimeFormatter formatter = DateTimeFormatter.ofPattern(updatedDateOptions.format);
    List<String> dates = JSONLDUtils.extractString(document, "dateModified");
    for(String updatedDate : dates){
        if(updatedDate == null || updatedDate.trim().length() == 0) continue;
        try {
            updatedDates.add(LocalDate.parse(updatedDate, formatter));
        } catch (Exception e) {
            // best effort: a value that does not match the configured pattern is skipped
            continue;
        }
    }
    return updatedDates;
}
/**
 * Parses the document's {@code dateCreated} values with the configured created-date pattern.
 * Yields an empty list when no pattern is configured; blank or unparsable values are skipped.
 */
private List<LocalDate> extractCreatedDate(JSONObject document){
    final List<LocalDate> parsed = new ArrayList<>();
    final Options.CreatedDateOptions dateOptions = this.options.getCreatedDateOptions();
    final boolean configured = dateOptions != null && dateOptions.format != null && !dateOptions.format.isEmpty();
    if (!configured) {
        return parsed;
    }
    final DateTimeFormatter formatter = DateTimeFormatter.ofPattern(dateOptions.format);
    for (String raw : JSONLDUtils.extractString(document, "dateCreated")) {
        if (raw == null || raw.trim().isEmpty()) {
            continue;
        }
        try {
            parsed.add(LocalDate.parse(raw, formatter));
        } catch (Exception e) {
            // value does not match the pattern: skip it
        }
    }
    return parsed;
}
/**
 * Collects contributors from the {@code editor}, {@code funder}, {@code producer},
 * {@code sponsor} and {@code contributor} entries, in that order. The first four get the
 * matching contributor type; generic contributors get the configured fallback type, or
 * {@code Other} when none is configured. Principals without a name are dropped.
 */
private List<DatasetDocument.Contributor> extractContributors(JSONObject document){
    final List<JSONLDUtils.PrincipalInfo> editorList = JSONLDUtils.extractPrincipal(document, "editor");
    final List<JSONLDUtils.PrincipalInfo> funderList = JSONLDUtils.extractPrincipal(document, "funder");
    final List<JSONLDUtils.PrincipalInfo> producerList = JSONLDUtils.extractPrincipal(document, "producer");
    final List<JSONLDUtils.PrincipalInfo> sponsorList = JSONLDUtils.extractPrincipal(document, "sponsor");
    final List<JSONLDUtils.PrincipalInfo> genericList = JSONLDUtils.extractPrincipal(document, "contributor");

    final ArrayList<DatasetDocument.Contributor> result = new ArrayList<>();
    for (JSONLDUtils.PrincipalInfo editor : editorList) {
        if (editor.name() != null && !editor.name().trim().isEmpty()) {
            result.add(new DatasetDocument.Contributor(editor.name(), editor.affiliationNames(), DatasetDocument.Contributor.ContributorType.Editor));
        }
    }
    for (JSONLDUtils.PrincipalInfo funder : funderList) {
        if (funder.name() != null && !funder.name().trim().isEmpty()) {
            result.add(new DatasetDocument.Contributor(funder.name(), funder.affiliationNames(), DatasetDocument.Contributor.ContributorType.Funder));
        }
    }
    for (JSONLDUtils.PrincipalInfo producer : producerList) {
        if (producer.name() != null && !producer.name().trim().isEmpty()) {
            result.add(new DatasetDocument.Contributor(producer.name(), producer.affiliationNames(), DatasetDocument.Contributor.ContributorType.Producer));
        }
    }
    for (JSONLDUtils.PrincipalInfo sponsor : sponsorList) {
        if (sponsor.name() != null && !sponsor.name().trim().isEmpty()) {
            result.add(new DatasetDocument.Contributor(sponsor.name(), sponsor.affiliationNames(), DatasetDocument.Contributor.ContributorType.Sponsor));
        }
    }
    for (JSONLDUtils.PrincipalInfo generic : genericList) {
        if (generic.name() == null || generic.name().trim().isEmpty()) {
            continue;
        }
        final Options.ContributorOptions contributorOptions = this.options.getContributorOptions();
        final DatasetDocument.Contributor.ContributorType type =
                (contributorOptions != null && contributorOptions.fallbackType != null)
                        ? contributorOptions.fallbackType
                        : DatasetDocument.Contributor.ContributorType.Other;
        result.add(new DatasetDocument.Contributor(generic.name(), generic.affiliationNames(), type));
    }
    return result;
}
/** Returns the non-blank {@code keywords} values of the document, untrimmed. */
private List<String> extractSubjects(JSONObject document){
    final ArrayList<String> kept = new ArrayList<>();
    for (String keyword : JSONLDUtils.extractString(document, "keywords")) {
        if (keyword != null && !keyword.trim().isEmpty()) {
            kept.add(keyword);
        }
    }
    return kept;
}
/**
 * Parses the document's {@code datePublished} values with the configured publication-date pattern.
 * Yields an empty list when no pattern is configured; blank or unparsable values are skipped.
 */
private List<LocalDate> extractPublicationDate(JSONObject document){
    final List<LocalDate> parsed = new ArrayList<>();
    final Options.PublicationDateOptions dateOptions = this.options.getPublicationDateOptions();
    final boolean configured = dateOptions != null && dateOptions.format != null && !dateOptions.format.isEmpty();
    if (!configured) {
        return parsed;
    }
    final DateTimeFormatter formatter = DateTimeFormatter.ofPattern(dateOptions.format);
    for (String raw : JSONLDUtils.extractString(document, "datePublished")) {
        if (raw == null || raw.trim().isEmpty()) {
            continue;
        }
        try {
            parsed.add(LocalDate.parse(raw, formatter));
        } catch (Exception e) {
            // value does not match the pattern: skip it
        }
    }
    return parsed;
}
/** Returns the names of the document's {@code publisher} principals, skipping blank names. */
private List<String> extractPublisher(JSONObject document){
    final List<JSONLDUtils.PrincipalInfo> principals = JSONLDUtils.extractPrincipal(document, "publisher");
    final ArrayList<String> names = new ArrayList<>();
    for (JSONLDUtils.PrincipalInfo principal : principals) {
        if (principal.name() != null && !principal.name().trim().isEmpty()) {
            names.add(principal.name());
        }
    }
    return names;
}
/** Returns the distinct values found under {@code name} and {@code headline}. */
private List<String> extractTitles(JSONObject document){
    final List<String> fromName = JSONLDUtils.extractString(document, "name");
    final List<String> fromHeadline = JSONLDUtils.extractString(document, "headline");
    // a set merges the two sources without duplicates
    final HashSet<String> distinct = new HashSet<>();
    distinct.addAll(fromName);
    distinct.addAll(fromHeadline);
    final ArrayList<String> result = new ArrayList<>(distinct);
    return result;
}
/** Returns the distinct values found under {@code alternateName} and {@code alternativeHeadline}. */
private List<String> extractAlternateTitles(JSONObject document){
    final List<String> fromName = JSONLDUtils.extractString(document, "alternateName");
    final List<String> fromHeadline = JSONLDUtils.extractString(document, "alternativeHeadline");
    // a set merges the two sources without duplicates
    final HashSet<String> distinct = new HashSet<>();
    distinct.addAll(fromName);
    distinct.addAll(fromHeadline);
    final ArrayList<String> result = new ArrayList<>(distinct);
    return result;
}
/**
 * Maps the {@code identifier} entries of the document to typed dataset identifiers.
 *
 * Entries without a value are skipped. Entries without a type receive the
 * configured fallback type (or are skipped when none is configured). Typed
 * entries are matched against the configured ARK, DOI, Handle, PURL, URL and
 * URN mappings, in that order; entries matching none of them are skipped.
 *
 * @param document the schema.org dataset document to read from
 * @return the mapped identifiers in document order, never null
 */
private List<DatasetDocument.Identifier> extractIdentifier(JSONObject document){
    List<DatasetDocument.Identifier> result = new ArrayList<>();
    for(JSONLDUtils.IdentifierInfo info : JSONLDUtils.extractIdentifier(document, "identifier")){
        if(info.value == null || info.value.trim().length() == 0) continue;

        boolean untyped = info.type == null || info.type.trim().length() == 0;
        if(untyped) {
            // no declared type: use the fallback when there is one
            if (this.options.getIdentifierOptions().fallbackType != null) {
                result.add(new DatasetDocument.Identifier(this.options.getIdentifierOptions().fallbackType, info.value.trim()));
            }
            continue;
        }

        String declaredType = info.type.trim();
        DatasetDocument.Identifier.IdentifierType mappedType;
        if(this.options.getIdentifierOptions().mappingARK != null && this.options.getIdentifierOptions().mappingARK.contains(declaredType)) {
            mappedType = DatasetDocument.Identifier.IdentifierType.ARK;
        } else if(this.options.getIdentifierOptions().mappingDOI != null && this.options.getIdentifierOptions().mappingDOI.contains(declaredType)) {
            mappedType = DatasetDocument.Identifier.IdentifierType.DOI;
        } else if(this.options.getIdentifierOptions().mappingHandle != null && this.options.getIdentifierOptions().mappingHandle.contains(declaredType)) {
            mappedType = DatasetDocument.Identifier.IdentifierType.Handle;
        } else if(this.options.getIdentifierOptions().mappingPURL != null && this.options.getIdentifierOptions().mappingPURL.contains(declaredType)) {
            mappedType = DatasetDocument.Identifier.IdentifierType.PURL;
        } else if(this.options.getIdentifierOptions().mappingURL != null && this.options.getIdentifierOptions().mappingURL.contains(declaredType)) {
            mappedType = DatasetDocument.Identifier.IdentifierType.URL;
        } else if(this.options.getIdentifierOptions().mappingURN != null && this.options.getIdentifierOptions().mappingURN.contains(declaredType)) {
            mappedType = DatasetDocument.Identifier.IdentifierType.URN;
        } else {
            // declared type is not covered by any mapping
            continue;
        }
        result.add(new DatasetDocument.Identifier(mappedType, info.value.trim()));
    }
    return result;
}
/**
 * Builds the creator list from the {@code creator} and {@code author}
 * principals of the document.
 *
 * Creators are processed before authors. Principals with a null or blank name
 * are skipped, and a name that has already been seen is not added again.
 *
 * @param document the schema.org dataset document to read from
 * @return the creators in first-seen order, never null
 */
private List<DatasetDocument.Creator> extractCreator(JSONObject document){
    // one combined pass: creators first, then authors
    List<JSONLDUtils.PrincipalInfo> principals = new ArrayList<>(JSONLDUtils.extractPrincipal(document, "creator"));
    principals.addAll(JSONLDUtils.extractPrincipal(document, "author"));

    HashSet<String> seenNames = new HashSet<>();
    List<DatasetDocument.Creator> result = new ArrayList<>();
    for(JSONLDUtils.PrincipalInfo principal : principals){
        String principalName = principal.name();
        if(principalName == null || principalName.trim().length() == 0) continue;
        // add() returns false when the name was already present
        if(!seenNames.add(principalName)) continue;
        result.add(new DatasetDocument.Creator(principalName, principal.affiliationNames()));
    }
    return result;
}
}

View File

@ -0,0 +1,106 @@
package eu.dnetlib.data.collector.plugins.schemaorg;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Iterator;
public class EndpointAccessIterator implements Iterator<JSONObject> {
private static final Log log = LogFactory.getLog(EndpointAccessIterator.class);
public static class Options {
private Charset charset;
public Options(){}
public Options(Charset charset) {
this.charset = charset;
}
public Charset getCharset() {
return charset;
}
public void setCharset(Charset charset) {
this.charset = charset;
}
}
private Options options;
private Iterator<String> repositoryIterator;
public EndpointAccessIterator(Options options, Iterator<String> repositoryIterator) {
this.options = options;
this.repositoryIterator = repositoryIterator;
}
@Override
public boolean hasNext() {
return this.repositoryIterator.hasNext();
}
@Override
public JSONObject next() {
String endpoint = this.repositoryIterator.next();
if(endpoint == null) return null;
log.debug(String.format("processing: %s", endpoint));
JSONObject dataset = this.extractDatasetRecord(endpoint);
return dataset;
}
private JSONObject extractDatasetRecord(String endpoint) {
JSONObject datasetDocument = null;
try {
URL urlEndpoint = new URL(endpoint);
log.debug("downloading endpoint "+urlEndpoint);
String payload = Utils.RemoteAccessWithRetry(3, 5000, urlEndpoint, this.options.getCharset());
log.trace("downloaded payload id: "+payload);
Document doc = Jsoup.parse(payload);
Elements scriptTags = doc.getElementsByTag("script");
for (Element scriptTag : scriptTags) {
if (!scriptTag.hasAttr("type")) continue;
String scriptType = scriptTag.attr("type");
if (!scriptType.equalsIgnoreCase("application/ld+json")) continue;
String data = scriptTag.data();
JSONObject schemaItem = new JSONObject(data);
String context = schemaItem.optString("@context");
String type = schemaItem.optString("@type");
if (context == null || type == null) continue;
Boolean isSchemaOrgContext = context.toLowerCase().startsWith("http://schema.org") || context.toLowerCase().startsWith("https://schema.org");
Boolean isDataset = type.equalsIgnoreCase("dataset");
if (!isSchemaOrgContext || !isDataset) continue;
log.debug(String.format("discovered dataset document: %s", schemaItem.toString()));
datasetDocument = schemaItem;
break;
}
}catch(Exception ex){
log.error("problem extracting dataset document. returning empty", ex);
datasetDocument = null;
}
if(datasetDocument == null){
log.debug("did not find any dataset document in endpoint");
}
else{
log.debug("found dataset document in endpoint :"+datasetDocument.toString());
}
return datasetDocument;
}
}

View File

@ -0,0 +1,515 @@
package eu.dnetlib.data.collector.plugins.schemaorg;
import org.json.JSONArray;
import org.json.JSONObject;
import java.util.ArrayList;
import java.util.List;
public class JSONLDUtils {
/** Common view of a principal (person or organization) found in a JSON-LD document. */
public interface PrincipalInfo {

    /** @return the principal's name */
    String name();

    /** @return the names of the principal's affiliations; implementations may return {@code null} */
    List<String> affiliationNames();
}
public static class OrganizationInfo implements PrincipalInfo{
public String name;
public String name(){return this.name;}
public List<String> affiliationNames(){
return null;
}
public OrganizationInfo(){}
public OrganizationInfo(String name){
this.name = name;
}
}
public static class PersonInfo implements PrincipalInfo{
public String name;
public List<OrganizationInfo> affiliations;
public String name(){return this.name;}
public List<String> affiliationNames(){
if(this.affiliations == null) return null;
List<String> curated = new ArrayList<>();
for(OrganizationInfo item : this.affiliations){
if(item == null || item.name == null || item.name.trim().length() == 0) continue;;
curated.add(item.name.trim());
}
return curated;
}
public PersonInfo(){}
public PersonInfo(String name){
this.name = name;
}
public PersonInfo(String name, List<OrganizationInfo> affiliations){
this.name = name;
this.affiliations = affiliations;
}
}
/** License reference: a URL and, when available, a name. */
public static class LicenseInfo {
    public String name;
    public String url;

    public LicenseInfo() {}

    /** Creates a license known only by its URL; the name stays {@code null}. */
    public LicenseInfo(String url) {
        this(url, null);
    }

    public LicenseInfo(String url, String name) {
        this.url = url;
        this.name = name;
    }
}
/** Citation reference, identified by a URL. */
public static class CitationInfo {
    public String url;

    public CitationInfo() {}

    public CitationInfo(String url) {
        this.url = url;
    }
}
/** Identifier value together with its optional type label. */
public static class IdentifierInfo {
    public String value;
    public String type;

    public IdentifierInfo() {}

    /** Creates an untyped identifier; the type stays {@code null}. */
    public IdentifierInfo(String value) {
        this(value, null);
    }

    public IdentifierInfo(String value, String type) {
        this.value = value;
        this.type = type;
    }
}
/** Latitude/longitude pair, kept as the strings found in the document. */
public static class GeoCoordinatesInfo {
    public String latitude;
    public String longitude;

    public GeoCoordinatesInfo() {}

    public GeoCoordinatesInfo(String latitude, String longitude) {
        this.longitude = longitude;
        this.latitude = latitude;
    }
}
/** Geographic shape described by its {@code box} string. */
public static class GeoShapeInfo {
    public String box;

    public GeoShapeInfo() {}

    public GeoShapeInfo(String box) {
        this.box = box;
    }
}
/** A place: its name plus the coordinates and shapes extracted for it. */
public static class PlaceInfo {
    public String name;
    public List<GeoCoordinatesInfo> geoCoordinates;
    public List<GeoShapeInfo> geoShapes;

    public PlaceInfo() {}

    public PlaceInfo(String name, List<GeoCoordinatesInfo> geoCoordinates, List<GeoShapeInfo> geoShapes) {
        this.geoShapes = geoShapes;
        this.geoCoordinates = geoCoordinates;
        this.name = name;
    }
}
/**
 * Reads one {@code Place} object.
 *
 * @param document candidate JSON object, may be {@code null}
 * @return the place, or {@code null} when the object is missing, is not typed
 *         {@code Place}, or has neither a name nor any coordinates or shapes
 */
private static PlaceInfo extractPlaceSingle(JSONObject document){
    if(document == null) return null;
    if(!"Place".equals(document.optString("@type"))) return null;

    String placeName = document.optString("name");
    List<GeoCoordinatesInfo> coordinates = JSONLDUtils.extractGeoCoordinates(document, "geo");
    List<GeoShapeInfo> shapes = JSONLDUtils.extractGeoShapes(document, "geo");

    boolean hasName = placeName != null && placeName.trim().length() != 0;
    boolean hasCoordinates = coordinates != null && coordinates.size() != 0;
    boolean hasShapes = shapes != null && shapes.size() != 0;
    if(!hasName && !hasCoordinates && !hasShapes) return null;

    return new PlaceInfo(placeName, coordinates, shapes);
}
/**
 * Extracts the places stored under {@code key}, which may hold either a single
 * object or an array of objects.
 *
 * @param document the JSON-LD document to read from
 * @param key      the property name
 * @return the valid places found, never null
 */
public static List<PlaceInfo> extractPlaces(JSONObject document, String key) {
    List<PlaceInfo> places = new ArrayList<>();
    JSONArray many = document.optJSONArray(key);
    JSONObject one = document.optJSONObject(key);
    if (many == null) {
        if (one != null) {
            PlaceInfo place = JSONLDUtils.extractPlaceSingle(one);
            if (place != null) places.add(place);
        }
        return places;
    }
    for (int idx = 0; idx < many.length(); idx++) {
        PlaceInfo place = JSONLDUtils.extractPlaceSingle(many.optJSONObject(idx));
        if (place != null) places.add(place);
    }
    return places;
}
/**
 * Reads one {@code GeoCoordinates} object.
 *
 * @param document candidate JSON object, may be {@code null}
 * @return the coordinates, or {@code null} when the object is missing, is not
 *         typed {@code GeoCoordinates}, or lacks a non-blank latitude or longitude
 */
private static GeoCoordinatesInfo extractGeoCoordinatesSingle(JSONObject document){
    if(document == null) return null;
    if(!"GeoCoordinates".equals(document.optString("@type"))) return null;

    String lat = document.optString("latitude");
    if(lat == null || lat.trim().length() == 0) return null;
    String lon = document.optString("longitude");
    if(lon == null || lon.trim().length() == 0) return null;

    return new GeoCoordinatesInfo(lat, lon);
}
/**
 * Extracts the geo coordinates stored under {@code key}, which may hold either
 * a single object or an array of objects.
 *
 * @param document the JSON object to read from
 * @param key      the property name
 * @return the valid coordinates found, never null
 */
private static List<GeoCoordinatesInfo> extractGeoCoordinates(JSONObject document, String key) {
    List<GeoCoordinatesInfo> coordinates = new ArrayList<>();
    JSONArray many = document.optJSONArray(key);
    JSONObject one = document.optJSONObject(key);
    if (many == null) {
        if (one != null) {
            GeoCoordinatesInfo single = JSONLDUtils.extractGeoCoordinatesSingle(one);
            if (single != null) coordinates.add(single);
        }
        return coordinates;
    }
    for (int idx = 0; idx < many.length(); idx++) {
        GeoCoordinatesInfo entry = JSONLDUtils.extractGeoCoordinatesSingle(many.optJSONObject(idx));
        if (entry != null) coordinates.add(entry);
    }
    return coordinates;
}
/**
 * Reads one {@code GeoShape} object.
 *
 * @param document candidate JSON object, may be {@code null}
 * @return the shape, or {@code null} when the object is missing, is not typed
 *         {@code GeoShape}, or has no non-blank {@code box}
 */
private static GeoShapeInfo extractGeoShapeSingle(JSONObject document){
    if(document == null) return null;
    if(!"GeoShape".equals(document.optString("@type"))) return null;

    String boundingBox = document.optString("box");
    boolean usable = boundingBox != null && boundingBox.trim().length() != 0;
    return usable ? new GeoShapeInfo(boundingBox) : null;
}
/**
 * Extracts the geo shapes stored under {@code key}, which may hold either a
 * single object or an array of objects.
 *
 * @param document the JSON object to read from
 * @param key      the property name
 * @return the valid shapes found, never null
 */
private static List<GeoShapeInfo> extractGeoShapes(JSONObject document, String key) {
    List<GeoShapeInfo> shapes = new ArrayList<>();
    JSONArray many = document.optJSONArray(key);
    JSONObject one = document.optJSONObject(key);
    if (many == null) {
        if (one != null) {
            GeoShapeInfo single = JSONLDUtils.extractGeoShapeSingle(one);
            if (single != null) shapes.add(single);
        }
        return shapes;
    }
    for (int idx = 0; idx < many.length(); idx++) {
        GeoShapeInfo entry = JSONLDUtils.extractGeoShapeSingle(many.optJSONObject(idx));
        if (entry != null) shapes.add(entry);
    }
    return shapes;
}
/**
 * Reads one {@code Organization} object.
 *
 * @param document candidate JSON object, may be {@code null}
 * @return the organization, or {@code null} when the object is missing, is not
 *         typed {@code Organization}, or has no non-blank name
 */
private static OrganizationInfo extractOrganizationSingle(JSONObject document){
    if(document == null) return null;
    if(!"Organization".equals(document.optString("@type"))) return null;

    String organizationName = document.optString("name");
    boolean named = organizationName != null && organizationName.trim().length() != 0;
    return named ? new OrganizationInfo(organizationName) : null;
}
/**
 * Extracts the organizations stored under {@code key}, which may hold either a
 * single object or an array of objects.
 *
 * @param document the JSON object to read from
 * @param key      the property name
 * @return the valid organizations found, never null
 */
private static List<OrganizationInfo> extractOrganization(JSONObject document, String key) {
    List<OrganizationInfo> organizations = new ArrayList<>();
    JSONArray many = document.optJSONArray(key);
    JSONObject one = document.optJSONObject(key);
    if (many == null) {
        if (one != null) {
            OrganizationInfo single = JSONLDUtils.extractOrganizationSingle(one);
            if (single != null) organizations.add(single);
        }
        return organizations;
    }
    for (int idx = 0; idx < many.length(); idx++) {
        OrganizationInfo entry = JSONLDUtils.extractOrganizationSingle(many.optJSONObject(idx));
        if (entry != null) organizations.add(entry);
    }
    return organizations;
}
/**
 * Reads one {@code Person} object.
 *
 * When {@code name} is blank it is rebuilt from {@code familyName} and
 * {@code givenName} (family name first, separated by a space).
 *
 * @param document candidate JSON object, may be {@code null}
 * @return the person with its {@code affiliation} organizations, or {@code null}
 *         when the object is missing, is not typed {@code Person}, or no name
 *         could be determined
 */
private static PersonInfo extractPersonSingle(JSONObject document) {
    if(document == null) return null;
    if(!"Person".equals(document.optString("@type"))) return null;

    String fullName = document.optString("name");
    String given = document.optString("givenName");
    String family = document.optString("familyName");

    boolean nameMissing = fullName == null || fullName.trim().length() == 0;
    if (nameMissing) {
        if (given != null && family != null) {
            fullName = String.join(" ", family, given).trim();
        } else if (family != null) {
            fullName = family;
        } else if (given != null) {
            fullName = given;
        }
    }
    if(fullName == null || fullName.trim().length() == 0) return null;

    return new PersonInfo(fullName, JSONLDUtils.extractOrganization(document, "affiliation"));
}
/**
 * Extracts the persons stored under {@code key}, which may hold an array of
 * objects, a single object, or a plain string taken as the person's name.
 *
 * {@code optString} yields an empty string (not {@code null}) when the key is
 * absent, so the plain-string fallback only adds a person when the text is
 * non-blank; a missing property no longer produces a nameless entry.
 *
 * @param document the JSON object to read from
 * @param key      the property name
 * @return the persons found, never null
 */
private static List<PersonInfo> extractPerson(JSONObject document, String key) {
    List<PersonInfo> items = new ArrayList<>();
    JSONArray array = document.optJSONArray(key);
    JSONObject obj = document.optJSONObject(key);
    if (array != null) {
        for (int i = 0; i < array.length(); i += 1) {
            PersonInfo nfo = JSONLDUtils.extractPersonSingle(array.optJSONObject(i));
            if(nfo!=null) items.add(nfo);
        }
    }else if (obj!=null) {
        PersonInfo nfo = JSONLDUtils.extractPersonSingle(obj);
        if(nfo!=null) items.add(nfo);
    } else {
        String value = document.optString(key);
        // skip the "" that optString returns for an absent key
        if (value != null && !value.trim().isEmpty()) items.add(new PersonInfo(value));
    }
    return items;
}
/**
 * Reads one principal, trying {@code Person} first and {@code Organization} second.
 *
 * @param document candidate JSON object
 * @return the person or organization, or {@code null} when the object is neither
 */
public static PrincipalInfo extractPrincipalSingle(JSONObject document) {
    PersonInfo person = JSONLDUtils.extractPersonSingle(document);
    if (person != null) {
        return person;
    }
    return JSONLDUtils.extractOrganizationSingle(document);
}
/**
 * Extracts the principals (persons or organizations) stored under {@code key},
 * which may hold an array of objects, a single object, or a plain string taken
 * as a person's name.
 *
 * {@code optString} yields an empty string (not {@code null}) when the key is
 * absent, so the plain-string fallback only adds a person when the text is
 * non-blank; a missing property no longer produces a nameless entry.
 *
 * @param document the JSON-LD document to read from
 * @param key      the property name
 * @return the principals found, never null
 */
public static List<PrincipalInfo> extractPrincipal(JSONObject document, String key) {
    List<PrincipalInfo> items = new ArrayList<>();
    JSONArray array = document.optJSONArray(key);
    JSONObject obj = document.optJSONObject(key);
    if (array != null) {
        for (int i = 0; i < array.length(); i += 1) {
            PrincipalInfo nfo = JSONLDUtils.extractPrincipalSingle(array.optJSONObject(i));
            if(nfo!=null) items.add(nfo);
        }
    }else if (obj!=null) {
        PrincipalInfo nfo = JSONLDUtils.extractPrincipalSingle(obj);
        if(nfo!=null) items.add(nfo);
    } else {
        String value = document.optString(key);
        // skip the "" that optString returns for an absent key
        if (value != null && !value.trim().isEmpty()) items.add(new PersonInfo(value));
    }
    return items;
}
/**
 * Extracts the string value(s) stored under {@code key}.
 *
 * An array contributes each element that is not a JSON object; a scalar
 * contributes its string form; a JSON object value contributes nothing.
 *
 * @param document the JSON-LD document to read from
 * @param key      the property name
 * @return the strings found; empty when the key is absent, never null
 */
public static List<String> extractString(JSONObject document, String key){
    List<String> values = new ArrayList<>();
    if (!document.has(key)) return values;

    JSONArray many = document.optJSONArray(key);
    JSONObject nested = document.optJSONObject(key);
    if (many == null) {
        if (nested == null) {
            String scalar = document.optString(key);
            if(scalar != null) values.add(scalar);
        }
        return values;
    }
    for (int idx = 0; idx < many.length(); idx++) {
        // object elements are not plain strings: leave them out
        if(many.optJSONObject(idx) != null) continue;
        String element = many.optString(idx);
        if(element != null) values.add(element);
    }
    return values;
}
/**
 * Extracts the {@code contentSize} of every {@code DataDownload} stored under
 * {@code key}, which may hold a single object or an array of objects.
 *
 * {@code optString} yields an empty string (not {@code null}) for a missing
 * property, so blank sizes are skipped instead of being returned as empty entries.
 *
 * @param document the JSON-LD document to read from
 * @param key      the property name
 * @return the non-blank sizes found, never null
 */
public static List<String> extractSize(JSONObject document, String key){
    List<String> items = new ArrayList<>();
    JSONArray array = document.optJSONArray(key);
    JSONObject obj = document.optJSONObject(key);
    if (array != null) {
        for (int i = 0; i < array.length(); i += 1) {
            JSONObject item = array.optJSONObject(i);
            if (item == null || !"DataDownload".equals((item.optString("@type")))) continue;
            String size = item.optString("contentSize");
            if (size != null && !size.trim().isEmpty()) items.add(size);
        }
    } else if (obj != null) {
        String size = obj.optString("contentSize");
        if ("DataDownload".equals((obj.optString("@type"))) && size != null && !size.trim().isEmpty()) {
            items.add(size);
        }
    }
    return items;
}
/**
 * Extracts the {@code encodingFormat} and {@code fileFormat} of every
 * {@code DataDownload} stored under {@code key}, which may hold a single object
 * or an array of objects.
 *
 * {@code optString} yields an empty string (not {@code null}) for a missing
 * property, so blank formats are skipped instead of being returned as empty entries.
 *
 * @param document the JSON-LD document to read from
 * @param key      the property name
 * @return the non-blank formats found, never null
 */
public static List<String> extractEncodingFormat(JSONObject document, String key){
    List<String> items = new ArrayList<>();
    JSONArray array = document.optJSONArray(key);
    JSONObject obj = document.optJSONObject(key);
    if (array != null) {
        for (int i = 0; i < array.length(); i += 1) {
            JSONObject item = array.optJSONObject(i);
            if (item == null || !"DataDownload".equals((item.optString("@type")))) continue;
            String encodingFormat = item.optString("encodingFormat");
            if (encodingFormat != null && !encodingFormat.trim().isEmpty()) items.add(encodingFormat);
            String fileFormat = item.optString("fileFormat");
            if (fileFormat != null && !fileFormat.trim().isEmpty()) items.add(fileFormat);
        }
    } else if (obj != null) {
        if ("DataDownload".equals((obj.optString("@type")))) {
            String encodingFormat = obj.optString("encodingFormat");
            if (encodingFormat != null && !encodingFormat.trim().isEmpty()) items.add(encodingFormat);
            String fileFormat = obj.optString("fileFormat");
            if (fileFormat != null && !fileFormat.trim().isEmpty()) items.add(fileFormat);
        }
    }
    return items;
}
/**
 * Extracts language labels stored under {@code key}.
 *
 * The property may hold plain strings, {@code Language} objects (their
 * {@code name} and {@code alternateName} are collected), or an array mixing both.
 *
 * {@code optString} yields an empty string (not {@code null}) for a missing
 * property, so blank values are skipped; an absent key now gives an empty list
 * instead of a list holding one empty string.
 *
 * @param document the JSON-LD document to read from
 * @param key      the property name
 * @return the non-blank language labels found, never null
 */
public static List<String> extractLanguage(JSONObject document, String key){
    List<String> items = new ArrayList<>();
    JSONArray array = document.optJSONArray(key);
    JSONObject obj = document.optJSONObject(key);
    if (array != null) {
        for (int i = 0; i < array.length(); i += 1) {
            JSONObject item = array.optJSONObject(i);
            if (item == null) {
                String value = array.optString(i);
                if (value != null && !value.trim().isEmpty()) items.add(value);
            } else {
                if (!"Language".equals((item.optString("@type")))) continue;
                String name = item.optString("name");
                if (name != null && !name.trim().isEmpty()) items.add(name);
                String alternateName = item.optString("alternateName");
                if (alternateName != null && !alternateName.trim().isEmpty()) items.add(alternateName);
            }
        }
    } else if (obj != null) {
        if ("Language".equals((obj.optString("@type")))){
            String name = obj.optString("name");
            if (name != null && !name.trim().isEmpty()) items.add(name);
            String alternateName = obj.optString("alternateName");
            if (alternateName != null && !alternateName.trim().isEmpty()) items.add(alternateName);
        }
    } else {
        String value = document.optString(key);
        if (value != null && !value.trim().isEmpty()) items.add(value);
    }
    return items;
}
/**
 * Extracts licenses stored under {@code key}.
 *
 * The property may hold plain strings (taken as the license URL),
 * {@code CreativeWork} objects (their {@code url} and {@code name} are used),
 * or an array mixing both.
 *
 * {@code optString} yields an empty string (not {@code null}) for a missing
 * property, so a license is only added when it has a non-blank URL or name; an
 * absent key now gives an empty list.
 *
 * @param document the JSON-LD document to read from
 * @param key      the property name
 * @return the licenses found, never null
 */
public static List<LicenseInfo> extractLicenses(JSONObject document, String key){
    List<LicenseInfo> items = new ArrayList<>();
    JSONArray array = document.optJSONArray(key);
    JSONObject obj = document.optJSONObject(key);
    if (array != null) {
        for (int i = 0; i < array.length(); i += 1) {
            JSONObject item = array.optJSONObject(i);
            if (item == null) {
                String value = array.optString(i);
                if(value != null && !value.trim().isEmpty()) items.add(new LicenseInfo(value));
            } else {
                if (!"CreativeWork".equals((item.optString("@type")))) continue;
                String url = item.optString("url");
                String name = item.optString("name");
                boolean hasUrl = url != null && !url.trim().isEmpty();
                boolean hasName = name != null && !name.trim().isEmpty();
                if (hasUrl || hasName) items.add(new LicenseInfo(url, name));
            }
        }
    } else if (obj != null) {
        if("CreativeWork".equals((obj.optString("@type")))) {
            String url = obj.optString("url");
            String name = obj.optString("name");
            boolean hasUrl = url != null && !url.trim().isEmpty();
            boolean hasName = name != null && !name.trim().isEmpty();
            if (hasUrl || hasName) items.add(new LicenseInfo(url, name));
        }
    } else {
        String value = document.optString(key);
        if (value != null && !value.trim().isEmpty()) items.add(new LicenseInfo(value));
    }
    return items;
}
/**
 * Extracts citations stored under {@code key}.
 *
 * The property may hold plain strings (taken as the citation URL),
 * {@code CreativeWork} objects (their {@code url} is used), or an array mixing both.
 *
 * {@code optString} yields an empty string (not {@code null}) for a missing
 * property, so citations with a blank URL are skipped; an absent key now gives
 * an empty list.
 *
 * @param document the JSON-LD document to read from
 * @param key      the property name
 * @return the citations found, never null
 */
public static List<CitationInfo> extractCitations(JSONObject document, String key){
    List<CitationInfo> items = new ArrayList<>();
    JSONArray array = document.optJSONArray(key);
    JSONObject obj = document.optJSONObject(key);
    if (array != null) {
        for (int i = 0; i < array.length(); i += 1) {
            JSONObject item = array.optJSONObject(i);
            if (item == null) {
                String value = array.optString(i);
                if(value != null && !value.trim().isEmpty()) items.add(new CitationInfo(value));
            } else {
                if (!"CreativeWork".equals((item.optString("@type")))) continue;
                String url = item.optString("url");
                if (url != null && !url.trim().isEmpty()) items.add(new CitationInfo(url));
            }
        }
    } else if (obj != null) {
        if("CreativeWork".equals((obj.optString("@type")))) {
            String url = obj.optString("url");
            if (url != null && !url.trim().isEmpty()) items.add(new CitationInfo(url));
        }
    } else {
        String value = document.optString(key);
        if (value != null && !value.trim().isEmpty()) items.add(new CitationInfo(value));
    }
    return items;
}
/**
 * Reads one {@code PropertyValue} object as an identifier; its {@code name}
 * becomes the identifier type.
 *
 * @param document candidate JSON object, may be {@code null}
 * @return the identifier, or {@code null} when the object is missing, is not
 *         typed {@code PropertyValue}, or has no non-blank {@code value}
 */
private static IdentifierInfo extractIdentifierSingle(JSONObject document){
    if(document == null) return null;
    if(!"PropertyValue".equals(document.optString("@type"))) return null;

    String typeLabel = document.optString("name");
    String identifierValue = document.optString("value");
    boolean usable = identifierValue != null && identifierValue.trim().length() != 0;
    return usable ? new IdentifierInfo(identifierValue, typeLabel) : null;
}
/**
 * Extracts identifiers stored under {@code key}.
 *
 * The property may hold plain strings (untyped identifiers),
 * {@code PropertyValue} objects, or an array mixing both.
 *
 * {@code optString} yields an empty string (not {@code null}) for a missing
 * property, so blank plain-string identifiers are skipped; an absent key now
 * gives an empty list instead of one identifier with an empty value.
 *
 * @param document the JSON-LD document to read from
 * @param key      the property name
 * @return the identifiers found, never null
 */
public static List<IdentifierInfo> extractIdentifier(JSONObject document, String key) {
    List<IdentifierInfo> items = new ArrayList<>();
    JSONArray array = document.optJSONArray(key);
    JSONObject obj = document.optJSONObject(key);
    if (array != null) {
        for (int i = 0; i < array.length(); i += 1) {
            JSONObject element = array.optJSONObject(i);
            IdentifierInfo nfo = null;
            if (element == null) {
                // plain (non-object) element: an untyped identifier
                String value = array.optString(i);
                if (value != null && !value.trim().isEmpty()) nfo = new IdentifierInfo(value);
            } else {
                nfo = JSONLDUtils.extractIdentifierSingle(element);
            }
            if (nfo != null) items.add(nfo);
        }
    }else if (obj!=null) {
        IdentifierInfo nfo = JSONLDUtils.extractIdentifierSingle(obj);
        if (nfo != null) items.add(nfo);
    } else {
        String value = document.optString(key);
        if (value != null && !value.trim().isEmpty()) items.add(new IdentifierInfo(value));
    }
    return items;
}
}

View File

@ -0,0 +1,7 @@
package eu.dnetlib.data.collector.plugins.schemaorg;
import java.util.Iterator;
public interface RepositoryIterable extends Iterable<String> {
public static String TerminationHint = "df667391-676d-4c0f-9c40-426b1001607a";
}

View File

@ -0,0 +1,92 @@
package eu.dnetlib.data.collector.plugins.schemaorg;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
/**
 * Iterator over endpoint strings delivered through a blocking queue.
 *
 * Iteration ends once the {@link RepositoryIterable#TerminationHint} value is
 * polled from the queue. Until then {@link #hasNext()} stays {@code true} and
 * {@link #next()} may return {@code null} when nothing could be polled.
 */
public class RepositoryQueueIterator implements Iterator<String> {
    private static final Log log = LogFactory.getLog(RepositoryQueueIterator.class);

    /** Polling settings: whether to block, and for how long. */
    public static class Options {
        private Boolean blockPolling;
        private long pollTimeout;
        private TimeUnit pollTimeoutUnit;

        public Boolean getBlockPolling() {
            return blockPolling;
        }

        public void setBlockPolling(Boolean blockPolling) {
            this.blockPolling = blockPolling;
        }

        public long getPollTimeout() {
            return pollTimeout;
        }

        public void setPollTimeout(long pollTimeout) {
            this.pollTimeout = pollTimeout;
        }

        public TimeUnit getPollTimeoutUnit() {
            return pollTimeoutUnit;
        }

        public void setPollTimeoutUnit(TimeUnit pollTimeoutUnit) {
            this.pollTimeoutUnit = pollTimeoutUnit;
        }
    }

    private ArrayBlockingQueue<String> queue;
    private Options options;
    private boolean hasTerminated;

    /**
     * @param options polling settings
     * @param queue   queue the endpoints are read from
     */
    public RepositoryQueueIterator(Options options, ArrayBlockingQueue<String> queue) {
        this.options = options;
        this.queue = queue;
        this.hasTerminated = false;
    }

    /** @return {@code false} only after the termination hint has been polled */
    @Override
    public boolean hasNext() {
        return !this.hasTerminated;
    }

    /**
     * Polls the next endpoint.
     *
     * @return the endpoint, or {@code null} when nothing was polled or the
     *         termination hint was received
     * @throws NoSuchElementException when the thread is interrupted while blocked on the queue
     */
    @Override
    public String next() {
        String next = this.poll();
        log.debug("next endpoint to process: " + next);
        if (next != null && next.equalsIgnoreCase(RepositoryIterable.TerminationHint)) {
            log.debug("no more endpoints to process");
            this.hasTerminated = true;
            next = null;
        }
        return next;
    }

    /**
     * Takes one element from the queue, blocking up to the configured timeout
     * when block polling is enabled. An unset (null) blockPolling flag is
     * treated as non-blocking rather than failing on unboxing.
     */
    private String poll(){
        String item = null;
        log.debug("retrieving endpoint from queue");
        log.debug("queue size: " + queue.size());
        if(Boolean.TRUE.equals(this.options.getBlockPolling())) {
            try {
                item = this.queue.poll(this.options.getPollTimeout(), this.options.getPollTimeoutUnit());
            } catch (InterruptedException ex) {
                // restore the interrupt flag so callers further up can observe the interruption
                Thread.currentThread().interrupt();
                log.warn(String.format("interrupted while polling elements from queue (timeout %s %s). throwing", this.options.getPollTimeout(), this.options.getPollTimeoutUnit()), ex);
                NoSuchElementException failure = new NoSuchElementException(ex.getMessage());
                failure.initCause(ex);
                throw failure;
            }
        }
        else {
            item = this.queue.poll();
        }
        log.debug("retrieved endpoint from queue");
        log.debug("queue size: " + queue.size());
        return item;
    }
}

View File

@ -0,0 +1,49 @@
package eu.dnetlib.data.collector.plugins.schemaorg;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue;
/**
 * Iterable that chains the collection pipeline: repository endpoints are fed
 * to an {@link EndpointAccessIterator}, whose output is fed to a
 * {@link DatasetMappingIterator}.
 */
public class SchemaOrgIterable implements Iterable<String> {
    private static final Log log = LogFactory.getLog(SchemaOrgIterable.class);

    /** Bundles the option objects of the two pipeline stages. */
    public static class Options {
        private EndpointAccessIterator.Options endpointAccessOptions;
        private DatasetMappingIterator.Options datasetMappingOptions;

        public EndpointAccessIterator.Options getEndpointAccessOptions() {
            return this.endpointAccessOptions;
        }

        public void setEndpointAccessOptions(EndpointAccessIterator.Options endpointAccessOptions) {
            this.endpointAccessOptions = endpointAccessOptions;
        }

        public DatasetMappingIterator.Options getDatasetMappingOptions() {
            return this.datasetMappingOptions;
        }

        public void setDatasetMappingOptions(DatasetMappingIterator.Options datasetMappingOptions) {
            this.datasetMappingOptions = datasetMappingOptions;
        }
    }

    private Options options;
    private RepositoryIterable repository;

    /**
     * @param options    options for the endpoint access and dataset mapping stages
     * @param repository source of the endpoints to process
     */
    public SchemaOrgIterable(Options options, RepositoryIterable repository){
        this.repository = repository;
        this.options = options;
    }

    /** Builds a fresh pipeline on top of a new repository iterator. */
    @Override
    public Iterator<String> iterator() {
        Iterator<String> endpoints = this.repository.iterator();
        EndpointAccessIterator documents = new EndpointAccessIterator(this.options.getEndpointAccessOptions(), endpoints);
        return new DatasetMappingIterator(this.options.getDatasetMappingOptions(), documents);
    }
}

View File

@ -0,0 +1,84 @@
package eu.dnetlib.data.collector.plugins.schemaorg;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.concurrent.TimeUnit;
/**
 * Command-line driver that runs the schema.org plugin against the Kaggle HTTP
 * API and writes the collected records into a directory named after the
 * repository access type.
 */
public class SchemaOrgMainKaggle {
    private static final Log log = LogFactory.getLog(SchemaOrgMainKaggle.class);

    /**
     * Configures console logging, builds the collection parameters, runs the
     * collection and stores the output (an existing output directory is deleted first).
     *
     * @param args unused
     * @throws Exception on any collection or file-system failure
     */
    public static void main(String[] args) throws Exception {
        ConsoleAppender console = new ConsoleAppender();
        console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
        console.setThreshold(Level.DEBUG);
        console.activateOptions();
        Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);

        HashMap<String,String> params = new HashMap<>();
        params.put("consumerBlockPolling", Boolean.toString(true));
        params.put("consumerBlockPollingTimeout", "2");
        params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
        params.put("endpointCharset", StandardCharsets.UTF_8.name());
        // java.time pattern letters: 'y' = year and 'd' = day-of-month
        // ('Y' is week-based-year and 'D' is day-of-year, which do not describe ISO calendar dates)
        params.put("updatedDateFormat", "yyyy-MM-dd");
        params.put("createdDateFormat", "yyyy-MM-dd");
        params.put("publicationDateFormat", "yyyy-MM-dd");
        params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
        params.put("identifierFallbackType", DatasetDocument.Identifier.IdentifierType.Handle.toString());
        params.put("identifierFallbackURL", Boolean.toString(true));
        params.put("identifierMappingARK", "ark, ARK");
        params.put("identifierMappingDOI", "doi, DOI");
        params.put("identifierMappingHandle", "Handle, HANDLE");
        params.put("identifierMappingPURL", "purl, PURL");
        params.put("identifierMappingURN", "urn, URN");
        params.put("identifierMappingURL", "url, URL");
        params.put("repositoryAccessType", "httpapi-kaggle");
        params.put("httpapi-kaggle_queueSize", "100");
        params.put("httpapi-kaggle_APICharset", StandardCharsets.UTF_8.name());
        params.put("httpapi-kaggle_queryUrl", "https://www.kaggle.com/datasets_v2.json?sortBy=updated&group=public&page={PAGE}&pageSize=20&size=sizeAll&filetype=fileTypeAll&license=licenseAll");
        params.put("httpapi-kaggle_queryPagePlaceholder", "{PAGE}");
        params.put("httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems");
        params.put("httpapi-kaggle_responsePropertyDatasetList", "datasetListItems");
        params.put("httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl");
        params.put("httpapi-kaggle_responseBaseDatasetUrl", "https://www.kaggle.com");
        params.put("httpapi-kaggle_producerBlockPollingTimeout", "2");
        params.put("httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());

        InterfaceDescriptor descriptor = new InterfaceDescriptor();
        descriptor.setId("schema.org - kaggle");
        descriptor.setBaseUrl("https://www.kaggle.com");
        descriptor.setParams(params);

        SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
        Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);

        String outDir = params.get("repositoryAccessType");
        log.info("saving content in " + outDir);
        File directory = new File(outDir);
        if (directory.exists()) {
            log.info(directory.getAbsolutePath() + " exists, cleaning up");
            FileUtils.deleteDirectory(directory);
        }
        FileUtils.forceMkdir(directory);
        Utils.writeFiles(iterable, outDir);
    }
}

View File

@ -0,0 +1,80 @@
package eu.dnetlib.data.collector.plugins.schemaorg;
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.concurrent.TimeUnit;
/**
 * Command-line driver that runs the schema.org plugin against the Reactome
 * sitemap index and writes the collected records into a directory named after
 * the repository access type.
 */
public class SchemaOrgMainReactome {
    private static final Log log = LogFactory.getLog(SchemaOrgMainReactome.class);

    /**
     * Configures console logging, builds the collection parameters, runs the
     * collection and stores the output (an existing output directory is deleted first).
     *
     * @param args unused
     * @throws Exception on any collection or file-system failure
     */
    public static void main(String[] args) throws Exception {
        ConsoleAppender console = new ConsoleAppender();
        console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
        console.setThreshold(Level.DEBUG);
        console.activateOptions();
        Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);

        HashMap<String,String> params = new HashMap<>();
        params.put("consumerBlockPolling", Boolean.toString(true));
        params.put("consumerBlockPollingTimeout", "2");
        params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
        params.put("endpointCharset", StandardCharsets.UTF_8.name());
        // java.time pattern letters: 'y' = year and 'd' = day-of-month
        // ('Y' is week-based-year and 'D' is day-of-year, which do not describe ISO calendar dates)
        params.put("updatedDateFormat", "yyyy-MM-dd");
        params.put("createdDateFormat", "yyyy-MM-dd");
        params.put("publicationDateFormat", "yyyy-MM-dd");
        params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
        params.put("identifierFallbackType", DatasetDocument.Identifier.IdentifierType.Handle.toString());
        params.put("identifierFallbackURL", Boolean.toString(true));
        params.put("identifierMappingARK", "ark, ARK");
        params.put("identifierMappingDOI", "doi, DOI");
        params.put("identifierMappingHandle", "Handle, HANDLE");
        params.put("identifierMappingPURL", "purl, PURL");
        params.put("identifierMappingURN", "urn, URN");
        params.put("identifierMappingURL", "url, URL");
        params.put("repositoryAccessType", "sitemapindex");
        params.put("sitemap_queueSize", "100");
        params.put("sitemap_IndexCharset", StandardCharsets.UTF_8.name());
        params.put("sitemap_FileCharset", StandardCharsets.UTF_8.name());
        params.put("sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Text.toString());
        params.put("sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.GZ.toString());
        params.put("sitemap_producerBlockPollingTimeout", "2");
        params.put("sitemap_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());

        InterfaceDescriptor descriptor = new InterfaceDescriptor();
        descriptor.setId("schema.org - reactome");
        descriptor.setBaseUrl("https://reactome.org/sitemapindex.xml");
        descriptor.setParams(params);

        SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
        Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);

        String outDir = params.get("repositoryAccessType");
        log.info("saving content in " + outDir);
        File directory = new File(outDir);
        if (directory.exists()) {
            log.info(directory.getAbsolutePath() + " exists, cleaning up");
            FileUtils.deleteDirectory(directory);
        }
        FileUtils.forceMkdir(directory);
        Utils.writeFiles(iterable, outDir);
    }
}

View File

@ -0,0 +1,153 @@
package eu.dnetlib.data.collector.plugins.schemaorg;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle.KaggleRepositoryIterable;
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexIterator;
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexRepositoryIterable;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.TimeUnit;
/**
 * Collector plugin that wraps a {@link RepositoryIterable} in a {@link SchemaOrgIterable}.
 * <p>
 * The repository implementation is selected by the {@code repositoryAccessType} interface parameter
 * ({@code sitemapindex} or {@code httpapi-kaggle}). All other settings are read from the interface
 * parameters, falling back to the defaults coded in the {@code compile*Options} helpers.
 */
public class SchemaOrgPlugin extends AbstractCollectorPlugin {

    private static final Log log = LogFactory.getLog(SchemaOrgPlugin.class);

    /**
     * @return the constant string {@code "hello"}
     */
    public String hello(){
        return "hello";
    }

    /**
     * Builds the repository iterable selected by {@code repositoryAccessType}, bootstraps it and
     * returns a {@link SchemaOrgIterable} on top of it.
     *
     * @param interfaceDescriptor supplies the base url and the parameter map
     * @param fromDate            not used by this implementation
     * @param untilDate           not used by this implementation
     * @return the iterable built over the selected repository
     * @throws CollectorServiceException if {@code repositoryAccessType} is missing or unrecognized,
     *                                   or if anything else fails while building the iterable
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) throws CollectorServiceException {
        try {
            final String repositoryAccessType = Utils.getAsString(interfaceDescriptor.getParams(), "repositoryAccessType", null);
            // switching on a null String throws a NullPointerException; report the real problem instead
            if (repositoryAccessType == null) {
                throw new CollectorServiceException("missing mandatory parameter repositoryAccessType");
            }

            RepositoryIterable repository = null;
            switch (repositoryAccessType) {
                case "sitemapindex": {
                    SitemapIndexRepositoryIterable sitemapRepository =
                            new SitemapIndexRepositoryIterable(this.compileSitemapIndexRepositoryOptions(interfaceDescriptor));
                    sitemapRepository.bootstrap();
                    repository = sitemapRepository;
                    break;
                }
                case "httpapi-kaggle": {
                    KaggleRepositoryIterable kaggleRepository =
                            new KaggleRepositoryIterable(this.compileKaggleRepositoryOptions(interfaceDescriptor));
                    kaggleRepository.bootstrap();
                    repository = kaggleRepository;
                    break;
                }
                default:
                    // the original format string had no placeholder, so the offending value was never reported
                    throw new CollectorServiceException(String.format("unrecognized repository access type %s", repositoryAccessType));
            }

            return new SchemaOrgIterable(this.compileSchemaOrgOptions(interfaceDescriptor), repository);
        } catch (CollectorServiceException e) {
            // already carries a specific message; do not bury it under the generic one
            throw e;
        } catch (Exception e) {
            throw new CollectorServiceException("Could not create iterator", e);
        }
    }

    /** Reads the {@code httpapi-kaggle_*} parameters into the Kaggle repository options. */
    private KaggleRepositoryIterable.Options compileKaggleRepositoryOptions(InterfaceDescriptor interfaceDescriptor) {
        KaggleRepositoryIterable.Options opts = new KaggleRepositoryIterable.Options();
        opts.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "httpapi-kaggle_queueSize", 100));
        opts.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeout", 20));
        opts.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
        opts.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "httpapi-kaggle_APICharset", StandardCharsets.UTF_8));
        opts.setQueryUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryUrl", null));
        opts.setQueryPagePlaceholder(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryPagePlaceholder", "{PAGE}"));
        opts.setResponsePropertyTotalDataset(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems"));
        opts.setResponsePropertyDatasetList(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetList", "datasetListItems"));
        opts.setResponsePropertyDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl"));
        // the interface base url is the default prefix for the dataset url fragments
        opts.setResponseBaseDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responseBaseDatasetUrl", interfaceDescriptor.getBaseUrl()));
        opts.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor));
        return opts;
    }

    /**
     * Builds the sitemap index options; the index url is the interface base url.
     *
     * @throws MalformedURLException if the base url is not a valid URL
     */
    private SitemapIndexIterator.Options compileSitemapIndexOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
        SitemapIndexIterator.Options opts = new SitemapIndexIterator.Options();
        opts.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_IndexCharset", StandardCharsets.UTF_8));
        opts.setIndexUrl(new URL(interfaceDescriptor.getBaseUrl()));
        return opts;
    }

    /** Builds the per-file sitemap options (charset, schema type, file type); the file url is not set here. */
    private SitemapFileIterator.Options compileSitemapFileOptions(InterfaceDescriptor interfaceDescriptor) {
        SitemapFileIterator.Options opts = new SitemapFileIterator.Options();
        opts.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_FileCharset", StandardCharsets.UTF_8));
        opts.setSchemaType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Xml, SitemapFileIterator.Options.SitemapSchemaType.class));
        opts.setFileType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.Text, SitemapFileIterator.Options.SitemapFileType.class));
        return opts;
    }

    /** Reads the {@code consumerBlockPolling*} parameters into the queue iterator options. */
    private RepositoryQueueIterator.Options compileRepositoryQueueOptions(InterfaceDescriptor interfaceDescriptor) {
        RepositoryQueueIterator.Options opts = new RepositoryQueueIterator.Options();
        opts.setBlockPolling(Utils.getAsBoolean(interfaceDescriptor.getParams(), "consumerBlockPolling", true));
        opts.setPollTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "consumerBlockPollingTimeout", 2));
        opts.setPollTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
        return opts;
    }

    /**
     * Reads the {@code sitemap_*} queue parameters and nests the queue, file and index options.
     *
     * @throws MalformedURLException if the base url is not a valid URL
     */
    private SitemapIndexRepositoryIterable.Options compileSitemapIndexRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
        SitemapIndexRepositoryIterable.Options opts = new SitemapIndexRepositoryIterable.Options();
        opts.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "sitemap_queueSize", 100));
        opts.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeout", 20));
        opts.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
        opts.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor));
        opts.setSitemapFileIteratorOptions(this.compileSitemapFileOptions(interfaceDescriptor));
        opts.setSitemapIndexIteratorOptions(this.compileSitemapIndexOptions(interfaceDescriptor));
        return opts;
    }

    /** Reads the {@code endpointCharset} parameter (default UTF-8) into the endpoint access options. */
    private EndpointAccessIterator.Options compileEndpointAccessOptions(InterfaceDescriptor interfaceDescriptor) {
        EndpointAccessIterator.Options opts = new EndpointAccessIterator.Options();
        opts.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "endpointCharset", StandardCharsets.UTF_8));
        return opts;
    }

    /**
     * Reads the date-format, contributor and identifier parameters into the dataset mapping options.
     * <p>
     * NOTE(review): the default date pattern is "YYYY-MM-DD". In java.text.SimpleDateFormat and
     * java.time patterns 'Y' is the week-based year and 'D' the day of year; confirm how the
     * consumer of these options interprets the pattern before relying on the default.
     */
    private DatasetMappingIterator.Options compileDatasetMappingOptions(InterfaceDescriptor interfaceDescriptor) {
        DatasetMappingIterator.Options opts = new DatasetMappingIterator.Options();

        DatasetMappingIterator.Options.UpdatedDateOptions updatedDate = new DatasetMappingIterator.Options.UpdatedDateOptions();
        updatedDate.format = Utils.getAsString(interfaceDescriptor.getParams(), "updatedDateFormat", "YYYY-MM-DD");
        opts.setUpdatedDateOptions(updatedDate);

        DatasetMappingIterator.Options.CreatedDateOptions createdDate = new DatasetMappingIterator.Options.CreatedDateOptions();
        createdDate.format = Utils.getAsString(interfaceDescriptor.getParams(), "createdDateFormat", "YYYY-MM-DD");
        opts.setCreatedDateOptions(createdDate);

        DatasetMappingIterator.Options.PublicationDateOptions publicationDate = new DatasetMappingIterator.Options.PublicationDateOptions();
        publicationDate.format = Utils.getAsString(interfaceDescriptor.getParams(), "publicationDateFormat", "YYYY-MM-DD");
        opts.setPublicationDateOptions(publicationDate);

        DatasetMappingIterator.Options.ContributorOptions contributor = new DatasetMappingIterator.Options.ContributorOptions();
        contributor.fallbackType = Utils.getAsEnum(interfaceDescriptor.getParams(), "contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other, DatasetDocument.Contributor.ContributorType.class);
        opts.setContributorOptions(contributor);

        DatasetMappingIterator.Options.IdentifierOptions identifier = new DatasetMappingIterator.Options.IdentifierOptions();
        // no fallback identifier type and no mappings unless explicitly configured
        identifier.fallbackType = Utils.getAsEnum(interfaceDescriptor.getParams(), "identifierFallbackType", null, DatasetDocument.Identifier.IdentifierType.class);
        identifier.fallbackURL = Utils.getAsBoolean(interfaceDescriptor.getParams(), "identifierFallbackURL", true);
        identifier.mappingARK = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingARK", null);
        identifier.mappingDOI = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingDOI", null);
        identifier.mappingHandle = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingHandle", null);
        identifier.mappingPURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingPURL", null);
        identifier.mappingURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURL", null);
        identifier.mappingURN = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURN", null);
        opts.setIdentifierOptions(identifier);

        return opts;
    }

    /** Combines the dataset mapping and endpoint access options into the iterable options. */
    private SchemaOrgIterable.Options compileSchemaOrgOptions(InterfaceDescriptor interfaceDescriptor) {
        SchemaOrgIterable.Options opts = new SchemaOrgIterable.Options();
        opts.setDatasetMappingOptions(this.compileDatasetMappingOptions(interfaceDescriptor));
        opts.setEndpointAccessOptions(this.compileEndpointAccessOptions(interfaceDescriptor));
        return opts;
    }
}

View File

@ -0,0 +1,208 @@
package eu.dnetlib.data.collector.plugins.schemaorg;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import java.io.*;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.zip.GZIPInputStream;
/**
 * Static helpers shared by the schema.org collector classes: XPath extraction, gzip decompression,
 * typed lookups on the interface parameter map, remote access with retries and a file dump utility.
 */
public class Utils {

    private static final Log log = LogFactory.getLog(Utils.class);

    /**
     * Creates a DOM builder that does not resolve external entities or load external DTDs.
     * The xml parsed here comes from remote endpoints, so external entity resolution is switched off
     * (best effort: a feature the parser does not know is logged and skipped).
     */
    private static DocumentBuilder newDocumentBuilder() throws Exception {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        disableFeature(factory, "http://xml.org/sax/features/external-general-entities");
        disableFeature(factory, "http://xml.org/sax/features/external-parameter-entities");
        disableFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd");
        return factory.newDocumentBuilder();
    }

    /** Turns a parser feature off, tolerating parsers that do not support it. */
    private static void disableFeature(DocumentBuilderFactory factory, String feature) {
        try {
            factory.setFeature(feature, false);
        } catch (javax.xml.parsers.ParserConfigurationException e) {
            log.debug("xml parser does not support feature " + feature, e);
        }
    }

    /** Returns the raw parameter value, or null when the map itself or the entry is missing. */
    private static String rawValue(HashMap<String,String> map, String key) {
        return map == null ? null : map.get(key);
    }

    /**
     * Parses the xml string and returns the node values selected by the XPath expression.
     *
     * @throws Exception if the xml cannot be parsed or the expression cannot be compiled/evaluated
     */
    public static List<String> collectAsStrings(String xml, String xpath) throws Exception{
        Document doc = newDocumentBuilder().parse(new InputSource(new StringReader(xml)));
        return Utils.collectAsStrings(doc, xpath);
    }

    /**
     * Parses the xml file and returns the node values selected by the XPath expression.
     *
     * @throws Exception if the file cannot be parsed or the expression cannot be compiled/evaluated
     */
    public static List<String> collectAsStrings(File file, String xpath) throws Exception{
        Document doc = newDocumentBuilder().parse(file);
        return Utils.collectAsStrings(doc, xpath);
    }

    /**
     * Evaluates the XPath expression as a node set and returns each node's value, in document order
     * as delivered by the evaluator.
     *
     * @throws Exception if the expression cannot be compiled or evaluated
     */
    public static List<String> collectAsStrings(Document doc, String xpath) throws Exception{
        XPathExpression expr = XPathFactory.newInstance().newXPath().compile(xpath);
        NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
        List<String> values = new ArrayList<>(nodes.getLength());
        for (int i = 0; i < nodes.getLength(); i++) {
            values.add(nodes.item(i).getNodeValue());
        }
        return values;
    }

    /**
     * Decompresses the gzip file {@code input} into {@code output}.
     *
     * @throws Exception if reading, decompressing or writing fails
     */
    public static void decompressGZipTo(File input, File output) throws Exception {
        try (GZIPInputStream in = new GZIPInputStream(new FileInputStream(input));
             FileOutputStream out = new FileOutputStream(output)) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
        }
    }

    /** @return the value stored under {@code key}, or {@code defaultValue} when it is absent */
    public static String getAsString(HashMap<String,String> map, String key, String defaultValue)
    {
        String value = rawValue(map, key);
        return value == null ? defaultValue : value;
    }

    /**
     * Splits the value on commas, trimming each item and dropping empty ones.
     *
     * @return the curated items, or {@code defaultValue} when the key is absent
     */
    public static List<String> getAsStringCsv(HashMap<String,String> map, String key, List<String> defaultValue)
    {
        String value = rawValue(map, key);
        if (value == null) return defaultValue;
        List<String> curated = new ArrayList<>();
        for (String item : value.split(",")) {
            String trimmed = item.trim();
            if (trimmed.length() > 0) {
                curated.add(trimmed);
            }
        }
        return curated;
    }

    /** @return the value parsed as int, or {@code defaultValue} when absent or not a number */
    public static int getAsInt(HashMap<String,String> map, String key, int defaultValue)
    {
        String value = rawValue(map, key);
        if (value == null) return defaultValue;
        try {
            // surrounding whitespace is tolerated so that " 100" is not silently replaced by the default
            return Integer.parseInt(value.trim());
        } catch (NumberFormatException e) {
            return defaultValue;
        }
    }

    /** @return the value parsed as long, or {@code defaultValue} when absent or not a number */
    public static long getAsLong(HashMap<String,String> map, String key, long defaultValue)
    {
        String value = rawValue(map, key);
        if (value == null) return defaultValue;
        try {
            return Long.parseLong(value.trim());
        } catch (NumberFormatException e) {
            return defaultValue;
        }
    }

    /**
     * Resolves the value to a constant of {@code clazz}, comparing names case-insensitively.
     *
     * @return the matching constant, or {@code defaultValue} when absent or when nothing matches
     */
    public static <E extends Enum<E>> E getAsEnum(HashMap<String,String> map, String key, E defaultValue, Class<E> clazz) {
        String value = rawValue(map, key);
        if (value == null) return defaultValue;
        for (E candidate : EnumSet.allOf(clazz)) {
            if (candidate.name().equalsIgnoreCase(value)) {
                return candidate;
            }
        }
        return defaultValue;
    }

    /** @return {@link Boolean#parseBoolean(String)} of the value, or {@code defaultValue} when absent */
    public static Boolean getAsBoolean(HashMap<String,String> map, String key, Boolean defaultValue) {
        String value = rawValue(map, key);
        if (value == null) return defaultValue;
        return Boolean.parseBoolean(value);
    }

    /** @return the named charset, or {@code defaultValue} when absent, unsupported or syntactically illegal */
    public static Charset getAsCharset(HashMap<String,String> map, String key, Charset defaultValue)
    {
        String value = rawValue(map, key);
        if (value == null) return defaultValue;
        try {
            return Charset.forName(value);
        } catch (IllegalArgumentException e) {
            // covers both UnsupportedCharsetException and IllegalCharsetNameException (e.g. an empty name)
            return defaultValue;
        }
    }

    /**
     * Reads the content of {@code endpoint} as a string, attempting at most {@code retryCount} times
     * and sleeping {@code waitBetweenRetriesMillis} between attempts.
     *
     * @return the content, or {@code null} when {@code retryCount} is not positive
     * @throws IOException the failure of the last attempt, or of the current attempt when the thread
     *                     is interrupted while waiting (the interrupt flag is restored)
     */
    public static String RemoteAccessWithRetry(int retryCount, long waitBetweenRetriesMillis, URL endpoint, Charset charset) throws IOException {
        int retry = 0;
        while (retry < retryCount) {
            try {
                return IOUtils.toString(endpoint, charset);
            } catch (Exception ex) {
                retry += 1;
                if (retry >= retryCount) {
                    log.debug("problem accessing url " + endpoint + ". throwing");
                    throw ex;
                }
                log.debug("problem accessing url " + endpoint + ". will retry after " + waitBetweenRetriesMillis + " milliseconds");
                try {
                    if (waitBetweenRetriesMillis > 0) {
                        Thread.sleep(waitBetweenRetriesMillis);
                    }
                } catch (InterruptedException interrupted) {
                    // do not keep retrying on an interrupted thread; keep the flag for the caller
                    Thread.currentThread().interrupt();
                    log.debug("interrupted while waiting to retry url " + endpoint + ". throwing");
                    throw ex;
                }
            }
        }
        return null;
    }

    /** @return true when the string parses as xml, false on any failure */
    public static Boolean validateXml(String xml){
        try {
            newDocumentBuilder().parse(new InputSource(new StringReader(xml)));
            return true;
        } catch (Exception ex) {
            return false;
        }
    }

    /**
     * Writes every item that has a non-blank {@code /dataset/identifier} text to a file named after
     * a running counter inside {@code outDir}; items without one are counted as skipped.
     * <p>
     * NOTE(review): files are written with {@link FileWriter}, i.e. in the platform default charset.
     *
     * @throws DocumentException if an item is not parseable xml
     * @throws IOException       if a file cannot be written
     */
    public static void writeFiles(final Iterable<String> iterable, final String outDir) throws DocumentException, IOException {
        int skipped = 0;
        int count = 0;
        for (String item : iterable) {
            final org.dom4j.Document doc = new SAXReader().read(new StringReader(item));
            if (StringUtils.isNotBlank(doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()"))) {
                log.info(item);
                String fileName = outDir + "/" + count++;
                try (BufferedWriter w = new BufferedWriter(new FileWriter(fileName))) {
                    w.write(item);
                }
                log.info("wrote " + fileName);
                // progress is reported only when the counter has just reached a multiple of 100
                if (count % 100 == 0) {
                    log.info("stored so far " + count);
                }
            } else {
                skipped++;
                if (skipped % 100 == 0) {
                    log.info("skipped so far " + skipped);
                }
            }
        }
        log.info(String.format("Done! skipped %s, stored %s", skipped, count));
    }
}

View File

@ -0,0 +1,6 @@
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi;
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
/**
 * Sub-interface of {@link RepositoryIterable} that declares no members of its own.
 */
public interface HttpApiRepositoryIterable extends RepositoryIterable {
}

View File

@ -0,0 +1,208 @@
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle;
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryQueueIterator;
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.HttpApiRepositoryIterable;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONArray;
import org.json.JSONObject;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
 * Repository iterable that pages through an http api returning json and queues one dataset url
 * per listed item. A background thread started by {@link #bootstrap()} fills the queue;
 * {@link #iterator()} hands the queue to a {@link RepositoryQueueIterator}.
 */
public class KaggleRepositoryIterable implements HttpApiRepositoryIterable {

    private static final Log log = LogFactory.getLog(KaggleRepositoryIterable.class);

    /** Settings for the paged query, the json property names and the producer side of the queue. */
    public static class Options {
        private String queryUrl;
        private String queryPagePlaceholder;
        private Charset charset;
        private String responsePropertyTotalDataset;
        private String responsePropertyDatasetList;
        private String responsePropertyDatasetUrl;
        private String responseBaseDatasetUrl;
        private long putTimeout;
        private TimeUnit putTimeoutUnit;
        private RepositoryQueueIterator.Options repositoryQueueIteratorOptions;
        private int queueSize;

        /** @return how long the producer waits for free space in the queue */
        public long getPutTimeout() {
            return putTimeout;
        }

        public void setPutTimeout(long putTimeout) {
            this.putTimeout = putTimeout;
        }

        /** @return the unit of {@link #getPutTimeout()} */
        public TimeUnit getPutTimeoutUnit() {
            return putTimeoutUnit;
        }

        public void setPutTimeoutUnit(TimeUnit putTimeoutUnit) {
            this.putTimeoutUnit = putTimeoutUnit;
        }

        /** @return the capacity of the endpoint queue */
        public int getQueueSize() {
            return queueSize;
        }

        public void setQueueSize(int queueSize) {
            this.queueSize = queueSize;
        }

        /** @return the prefix prepended to every dataset url fragment */
        public String getResponseBaseDatasetUrl() {
            return responseBaseDatasetUrl;
        }

        public void setResponseBaseDatasetUrl(String responseBaseDatasetUrl) {
            this.responseBaseDatasetUrl = responseBaseDatasetUrl;
        }

        /** @return the options passed to the {@link RepositoryQueueIterator} */
        public RepositoryQueueIterator.Options getRepositoryQueueIteratorOptions() {
            return repositoryQueueIteratorOptions;
        }

        public void setRepositoryQueueIteratorOptions(RepositoryQueueIterator.Options repositoryQueueIteratorOptions) {
            this.repositoryQueueIteratorOptions = repositoryQueueIteratorOptions;
        }

        /** @return the json property holding a dataset's url fragment */
        public String getResponsePropertyDatasetUrl() {
            return responsePropertyDatasetUrl;
        }

        public void setResponsePropertyDatasetUrl(String responsePropertyDatasetUrl) {
            this.responsePropertyDatasetUrl = responsePropertyDatasetUrl;
        }

        /** @return the json property holding the array of datasets of a page */
        public String getResponsePropertyDatasetList() {
            return responsePropertyDatasetList;
        }

        public void setResponsePropertyDatasetList(String responsePropertyDatasetList) {
            this.responsePropertyDatasetList = responsePropertyDatasetList;
        }

        /** @return the json property holding the total number of datasets */
        public String getResponsePropertyTotalDataset() {
            return responsePropertyTotalDataset;
        }

        public void setResponsePropertyTotalDataset(String responsePropertyTotalDataset) {
            this.responsePropertyTotalDataset = responsePropertyTotalDataset;
        }

        /** @return the charset used to decode the api responses */
        public Charset getCharset() {
            return charset;
        }

        public void setCharset(Charset charset) {
            this.charset = charset;
        }

        /** @return the token in the query url that is replaced by the page number */
        public String getQueryPagePlaceholder() {
            return queryPagePlaceholder;
        }

        public void setQueryPagePlaceholder(String queryPagePlaceholder) {
            this.queryPagePlaceholder = queryPagePlaceholder;
        }

        /** @return the query url template */
        public String getQueryUrl() {
            return queryUrl;
        }

        public void setQueryUrl(String queryUrl) {
            this.queryUrl = queryUrl;
        }
    }

    private Options options;
    // created by bootstrap(); null until then
    private ArrayBlockingQueue<String> queue;

    public KaggleRepositoryIterable(Options options) {
        this.options = options;
    }

    /** Creates the endpoint queue and starts the harvesting thread. */
    public void bootstrap() {
        this.queue = new ArrayBlockingQueue<>(this.options.getQueueSize());
        new Thread(new Harvester()).start();
    }

    @Override
    public Iterator<String> iterator() {
        return new RepositoryQueueIterator(this.options.getRepositoryQueueIteratorOptions(), this.queue);
    }

    /** Producer: reads the api page by page and queues the dataset endpoints, then the termination hint. */
    private class Harvester implements Runnable{

        @Override
        public void run() {
            this.execute();
        }

        /**
         * Runs the harvest and always tries to queue the termination hint afterwards,
         * whether the harvest completed, failed or was interrupted.
         */
        private void execute() {
            boolean interrupted = false;
            try {
                this.harvest();
            } catch (InterruptedException ex) {
                interrupted = true;
                log.warn("interrupted while adding endpoints in queue. breaking", ex);
            } catch (Exception ex) {
                log.error("problem execution harvesting", ex);
            } finally {
                try {
                    // offer() reports a timeout through its return value, not through an exception
                    if (!queue.offer(RepositoryIterable.TerminationHint, options.getPutTimeout(), options.getPutTimeoutUnit())) {
                        log.fatal("could not add termination hint. the process will not terminate gracefully");
                    }
                } catch (InterruptedException ex) {
                    interrupted = true;
                    log.fatal("could not add termination hint. the process will not terminate gracefully", ex);
                } catch (Exception ex) {
                    log.fatal("could not add termination hint. the process will not terminate gracefully", ex);
                }
                // restored last so that the termination hint above still gets its chance
                if (interrupted) {
                    Thread.currentThread().interrupt();
                }
            }
        }

        /**
         * Requests pages starting from 1 until a page has no datasets, the number of datasets read
         * reaches the reported total, or an endpoint cannot be queued within the put timeout.
         */
        private void harvest() throws Exception {
            int currentPage = 1;
            int readDatasets = 0;
            while (true) {
                String query = options.getQueryUrl().replace(options.getQueryPagePlaceholder(), Integer.toString(currentPage));
                String response = IOUtils.toString(new URL(query), options.getCharset());
                currentPage += 1;

                JSONObject pageObject = new JSONObject(response);
                int totalDatasets = pageObject.optInt(options.getResponsePropertyTotalDataset());
                JSONArray datasets = pageObject.optJSONArray(options.getResponsePropertyDatasetList());
                if (datasets == null || datasets.length() == 0) return;

                readDatasets += datasets.length();

                for (int i = 0; i < datasets.length(); i += 1) {
                    JSONObject item = datasets.optJSONObject(i);
                    // optJSONObject yields null for an element that is not a json object
                    if (item == null) continue;
                    String urlFragment = item.optString(options.getResponsePropertyDatasetUrl());
                    if (urlFragment == null || urlFragment.trim().length() == 0) continue;
                    String endpoint = String.format("%s%s", options.getResponseBaseDatasetUrl(), urlFragment);

                    log.debug("adding endpoint in queue");
                    log.debug("queue size: " + queue.size());
                    if (!queue.offer(endpoint, options.getPutTimeout(), options.getPutTimeoutUnit())) {
                        // nobody consumed for the whole timeout: stop instead of silently dropping endpoints
                        log.warn(String.format("could not put elements from queue for more than %s %s. breaking", options.getPutTimeout(), options.getPutTimeoutUnit()));
                        return;
                    }
                    log.debug("endpoint added in queue");
                    log.debug("queue size: " + queue.size());
                }

                if (readDatasets >= totalDatasets) return;
            }
        }
    }
}

View File

@ -0,0 +1,172 @@
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex;
import eu.dnetlib.data.collector.plugins.schemaorg.Utils;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.*;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.*;
/**
 * Iterator over the locations listed in a single sitemap file.
 * <p>
 * {@link #bootstrap()} downloads the file, optionally gunzips it, extracts all locations and
 * deletes the temporary files; iteration then runs over the in-memory list. Before
 * {@code bootstrap()} is called the iterator is empty.
 */
public class SitemapFileIterator implements Iterator<String> {

    private static final Log log = LogFactory.getLog(SitemapFileIterator.class);

    /** Location, charset, schema and compression of the sitemap file to read. */
    public static class Options {

        /** How the downloaded file is stored: plain or gzip-compressed. */
        public enum SitemapFileType{
            Text,
            GZ
        }

        /** How locations are listed: one per line, or as an xml urlset. */
        public enum SitemapSchemaType{
            Text,
            Xml
        }

        public Options(){}

        public Options(URL fileUrl, Charset charset, SitemapSchemaType schemaType, SitemapFileType fileType) {
            this.fileUrl = fileUrl;
            this.charset = charset;
            this.schemaType = schemaType;
            this.fileType = fileType;
        }

        private SitemapFileType fileType;
        private SitemapSchemaType schemaType;
        private URL fileUrl;
        private Charset charset;

        public Charset getCharset() {
            return charset;
        }

        public void setCharset(Charset charset) {
            this.charset = charset;
        }

        public URL getFileUrl() {
            return fileUrl;
        }

        public void setFileUrl(URL fileUrl) {
            this.fileUrl = fileUrl;
        }

        public SitemapFileType getFileType() {
            return fileType;
        }

        public void setFileType(SitemapFileType fileType) {
            this.fileType = fileType;
        }

        public SitemapSchemaType getSchemaType() {
            return schemaType;
        }

        public void setSchemaType(SitemapSchemaType schemaType) {
            this.schemaType = schemaType;
        }

        /** @return a new Options instance carrying the same four settings */
        @Override
        public Object clone(){
            Options clone = new Options();
            clone.setCharset(this.getCharset());
            clone.setFileType(this.getFileType());
            clone.setFileUrl(this.getFileUrl());
            clone.setSchemaType(this.getSchemaType());
            return clone;
        }
    }

    private Options options;
    private File downloadedFile;
    private File contentFile;
    // starts empty so that hasNext()/next() are safe before bootstrap()
    private Queue<String> locations = new LinkedList<>();

    public SitemapFileIterator(Options options){
        this.options = options;
    }

    /**
     * Downloads the sitemap file and loads its locations. Any failure is logged and leaves the
     * iterator with 0 locations; the temporary files are removed in every case.
     */
    public void bootstrap() {
        LinkedList<String> endpoints = null;
        try {
            log.debug(String.format("bootstrapping sitemapindex file access for sitemapindex %s", this.options.getFileUrl()));
            this.downloadedFile = File.createTempFile(UUID.randomUUID().toString(), ".tmp");
            FileUtils.copyURLToFile(this.options.getFileUrl(), this.downloadedFile);
            log.debug(String.format("downloaded file: %s has size %d", this.downloadedFile.toString(), this.downloadedFile.length()));

            switch (this.options.getFileType()) {
                case Text: {
                    this.contentFile = this.downloadedFile;
                    break;
                }
                case GZ: {
                    this.contentFile = File.createTempFile(UUID.randomUUID().toString(), ".tmp");
                    Utils.decompressGZipTo(this.downloadedFile, this.contentFile);
                    log.debug(String.format("extracted gz file: %s has size %d", this.contentFile.toString(), this.contentFile.length()));
                    break;
                }
                default:
                    throw new CollectorServiceException("unrecognized file type " + this.options.getFileType());
            }

            List<String> content = this.collectContentLocations();
            log.debug(String.format("extracted %d sitemapindex endpoints", content.size()));
            endpoints = new LinkedList<>(content);
        } catch (Exception ex) {
            log.error(String.format("error processing sitemapindex %s. returning 0 endpoints", this.options.getFileUrl()), ex);
            endpoints = new LinkedList<>();
        } finally {
            deleteTempFile(this.contentFile);
            deleteTempFile(this.downloadedFile);
        }
        this.locations = endpoints;
    }

    /**
     * Deletes a temporary file now. deleteOnExit is used only as a fallback when that fails, because
     * every registered path stays in the JVM's delete-on-exit list until shutdown.
     */
    private static void deleteTempFile(File file) {
        if (file != null && file.exists() && !file.delete()) {
            file.deleteOnExit();
        }
    }

    /** Dispatches on the schema type to the text or xml reader. */
    private List<String> collectContentLocations() throws Exception{
        switch (this.options.getSchemaType()) {
            case Text:
                return this.collectTextContentLocations();
            case Xml:
                return this.collectXmlContentLocations();
            default:
                // this switch is on the schema type, so that is the value to report
                throw new CollectorServiceException("unrecognized schema type " + this.options.getSchemaType());
        }
    }

    /** Reads the content file line by line using the configured charset. */
    private List<String> collectTextContentLocations() throws Exception {
        log.debug("reading endpoint locations from text sitemapindex");
        try (FileInputStream in = new FileInputStream(this.contentFile)) {
            return IOUtils.readLines(in, this.options.getCharset());
        }
    }

    /** Extracts the text of every {@code /urlset/url/loc} element of the content file. */
    private List<String> collectXmlContentLocations() throws Exception {
        log.debug("reading endpoint locations from xml sitemapindex");
        return Utils.collectAsStrings(this.contentFile, "/urlset/url/loc/text()");
    }

    @Override
    public boolean hasNext() {
        return !this.locations.isEmpty();
    }

    /**
     * @return the next location
     * @throws NoSuchElementException if no locations are left
     */
    @Override
    public String next() {
        if (this.locations.isEmpty()) {
            throw new NoSuchElementException("no more sitemap locations");
        }
        return this.locations.poll();
    }
}

View File

@ -0,0 +1,74 @@
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex;
import eu.dnetlib.data.collector.plugins.schemaorg.Utils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.*;
/**
 * Iterator over the sitemap file urls listed in a sitemap index.
 * <p>
 * {@link #bootstrap()} fetches the index and extracts every {@code /sitemapindex/sitemap/loc}
 * text. The urls are kept in a {@link PriorityQueue}, so they are returned in natural string
 * order rather than in index order. Before {@code bootstrap()} is called the iterator is empty.
 */
public class SitemapIndexIterator implements Iterator<String> {

    private static final Log log = LogFactory.getLog(SitemapIndexIterator.class);

    /** Url and charset of the sitemap index. */
    public static class Options {
        private URL indexUrl;
        private Charset charset;

        public Options(){}

        public Options(URL indexUrl, Charset charset){
            this.indexUrl = indexUrl;
            this.charset = charset;
        }

        public URL getIndexUrl() {
            return indexUrl;
        }

        public void setIndexUrl(URL indexUrl) {
            this.indexUrl = indexUrl;
        }

        public Charset getCharset() {
            return charset;
        }

        public void setCharset(Charset charset) {
            this.charset = charset;
        }
    }

    private Options options;
    // starts empty so that hasNext()/next() are safe before bootstrap()
    private Queue<String> sitemapFiles = new PriorityQueue<String>();

    public SitemapIndexIterator(Options options) {
        this.options = options;
    }

    /**
     * Fetches the index (3 attempts, 5000 ms apart) and loads the sitemap file urls.
     * Any failure is logged and leaves the iterator with 0 files.
     */
    public void bootstrap() {
        List<String> files = null;
        try {
            log.debug("bootstrapping sitemapindex index access");
            String sitemapIndexPayload = Utils.RemoteAccessWithRetry(3, 5000, this.options.getIndexUrl(), this.options.getCharset());
            // the payload can be large; only build the message when it will actually be logged
            if (log.isDebugEnabled()) {
                log.debug(String.format("sitemapindex payload is: %s", sitemapIndexPayload));
            }
            files = Utils.collectAsStrings(sitemapIndexPayload, "/sitemapindex/sitemap/loc/text()");
            log.debug(String.format("extracted %d sitemapindex files", files.size()));
        } catch (Exception ex) {
            log.error("problem bootstrapping sitemapindex index access. returning 0 files", ex);
            files = new ArrayList<>();
        }
        this.sitemapFiles = new PriorityQueue<String>(files);
    }

    @Override
    public boolean hasNext() {
        return !this.sitemapFiles.isEmpty();
    }

    /**
     * @return the next sitemap file url
     * @throws NoSuchElementException if no files are left
     */
    @Override
    public String next() {
        final String file = this.sitemapFiles.poll();
        if (file == null) {
            throw new NoSuchElementException("no more sitemap files");
        }
        return file;
    }
}

View File

@ -0,0 +1,147 @@
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex;
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryQueueIterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.net.URL;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
public class SitemapIndexRepositoryIterable implements RepositoryIterable {
private static final Log log = LogFactory.getLog(SitemapIndexRepositoryIterable.class);
/**
 * Settings of the sitemap index repository: the options of the nested index, file and queue
 * iterators plus the capacity and put timeout of the endpoint queue.
 */
public static class Options {

    // nested iterator settings
    private SitemapIndexIterator.Options sitemapIndexIteratorOptions;
    private SitemapFileIterator.Options sitemapFileIteratorOptions;
    private RepositoryQueueIterator.Options repositoryQueueIteratorOptions;

    // producer side of the queue
    private long putTimeout;
    private TimeUnit putTimeoutUnit;
    private int queueSize;

    /** @return the options used to read the sitemap index */
    public SitemapIndexIterator.Options getSitemapIndexIteratorOptions() {
        return this.sitemapIndexIteratorOptions;
    }

    public void setSitemapIndexIteratorOptions(final SitemapIndexIterator.Options sitemapIndexIteratorOptions) {
        this.sitemapIndexIteratorOptions = sitemapIndexIteratorOptions;
    }

    /** @return the options used to read each sitemap file */
    public SitemapFileIterator.Options getSitemapFileIteratorOptions() {
        return this.sitemapFileIteratorOptions;
    }

    public void setSitemapFileIteratorOptions(final SitemapFileIterator.Options sitemapFileIteratorOptions) {
        this.sitemapFileIteratorOptions = sitemapFileIteratorOptions;
    }

    /** @return the options passed to the queue iterator */
    public RepositoryQueueIterator.Options getRepositoryQueueIteratorOptions() {
        return this.repositoryQueueIteratorOptions;
    }

    public void setRepositoryQueueIteratorOptions(final RepositoryQueueIterator.Options repositoryQueueIteratorOptions) {
        this.repositoryQueueIteratorOptions = repositoryQueueIteratorOptions;
    }

    /** @return the amount of time used when offering an element to the queue */
    public long getPutTimeout() {
        return this.putTimeout;
    }

    public void setPutTimeout(final long putTimeout) {
        this.putTimeout = putTimeout;
    }

    /** @return the unit of the put timeout */
    public TimeUnit getPutTimeoutUnit() {
        return this.putTimeoutUnit;
    }

    public void setPutTimeoutUnit(final TimeUnit putTimeoutUnit) {
        this.putTimeoutUnit = putTimeoutUnit;
    }

    /** @return the capacity of the endpoint queue */
    public int getQueueSize() {
        return this.queueSize;
    }

    public void setQueueSize(final int queueSize) {
        this.queueSize = queueSize;
    }
}
private Options options;
private ArrayBlockingQueue<String> queue;
public SitemapIndexRepositoryIterable(Options options) {
this.options = options;
}
public void bootstrap() {
this.queue = new ArrayBlockingQueue<>(this.options.getQueueSize());
Thread ft = new Thread(new Harvester() );
ft.start();
// ExecutorService executor = Executors.newSingleThreadExecutor();
// executor.execute(new Harvester());
// executor.shutdown();
}
@Override
public Iterator<String> iterator() {
return new RepositoryQueueIterator(this.options.getRepositoryQueueIteratorOptions(), this.queue);
}
private class Harvester implements Runnable{
@Override
public void run() {
this.execute();
}
private void execute(){
try {
SitemapIndexIterator sitemapIndexIterator = new SitemapIndexIterator(options.getSitemapIndexIteratorOptions());
sitemapIndexIterator.bootstrap();
while (sitemapIndexIterator.hasNext()) {
String sitemapFile = sitemapIndexIterator.next();
if(sitemapFile == null) continue;
SitemapFileIterator.Options sitemapFileIteratorOptions = (SitemapFileIterator.Options)options.getSitemapFileIteratorOptions().clone();
sitemapFileIteratorOptions.setFileUrl(new URL(sitemapFile));
SitemapFileIterator sitemapFileIterator = new SitemapFileIterator(sitemapFileIteratorOptions);
sitemapFileIterator.bootstrap();
while(sitemapFileIterator.hasNext()){
String endpoint = sitemapFileIterator.next();
if(endpoint == null) continue;;
log.debug("adding endpoint in queue");
log.debug("queue size: " + queue.size());
try {
queue.offer(endpoint, options.getPutTimeout(), options.getPutTimeoutUnit());
} catch (InterruptedException ex) {
log.warn(String.format("could not put elements from queue for more than %s %s. breaking", options.getPutTimeout(), options.getPutTimeoutUnit()));
break;
}
log.debug("endpoint added in queue");
log.debug("queue size: " + queue.size());
}
}
}catch(Exception ex){
log.error("problem execution harvesting", ex);
}
finally {
try {
queue.offer(RepositoryIterable.TerminationHint, options.getPutTimeout(), options.getPutTimeoutUnit());
} catch (Exception ex) {
log.fatal("could not add termination hint. the process will not terminate gracefully", ex);
}
}
}
}
}

View File

@ -0,0 +1,71 @@
package eu.dnetlib.data.collector.plugins.sftp;
import java.util.Iterator;
import java.util.Set;
import com.google.common.base.Splitter;
import com.google.common.collect.Sets;
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
/**
 * Collector plugin for the SFTP protocol: validates the interface parameters and returns an
 * {@link Iterable} whose iterators are created by the configured {@link SftpIteratorFactory}.
 * <p>
 * Created by andrea on 11/01/16.
 */
public class SftpCollectorPlugin extends AbstractCollectorPlugin {

    private SftpIteratorFactory sftpIteratorFactory;

    /**
     * Validates the collection parameters and prepares the iterable over the remote files.
     *
     * @param interfaceDescriptor provides the base URL and the params 'username', 'password',
     *                            'recursive' and 'extensions' (comma separated list)
     * @param fromDate            optional date in the format YYYY-MM-DD, handed to the iterator factory
     * @param toDate              not used by this plugin
     * @return an iterable delegating to {@link SftpIteratorFactory#newIterator}
     * @throws CollectorServiceException if a mandatory value is null or empty, or if fromDate is malformed
     */
    @Override
    public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String toDate)
            throws CollectorServiceException {

        // All values are read up front, then validated in a fixed order.
        final String baseUrl = interfaceDescriptor.getBaseUrl();
        final String username = interfaceDescriptor.getParams().get("username");
        final String password = interfaceDescriptor.getParams().get("password");
        final String recursive = interfaceDescriptor.getParams().get("recursive");
        final String extensions = interfaceDescriptor.getParams().get("extensions");

        assertNotEmpty("baseurl", baseUrl);
        assertNotEmpty("username", username);
        assertNotEmpty("password", password);
        assertNotEmpty("recursive", recursive);
        assertNotEmpty("extensions", extensions);

        final boolean validDate = (fromDate == null) || fromDate.matches("\\d{4}-\\d{2}-\\d{2}");
        if (!validDate) {
            throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + fromDate);
        }

        // Only the literal "true" enables recursion.
        final boolean isRecursive = "true".equals(recursive);
        // Comma separated list -> set of trimmed, non empty extensions.
        final Set<String> extensionsSet = Sets.newHashSet(Splitter.on(",").omitEmptyStrings().trimResults().split(extensions));

        return new Iterable<String>() {

            @Override
            public Iterator<String> iterator() {
                return getSftpIteratorFactory().newIterator(baseUrl, username, password, isRecursive, extensionsSet, fromDate);
            }
        };
    }

    /**
     * Fails when a mandatory value is missing.
     *
     * @param name  parameter name reported in the error message
     * @param value value to check
     * @throws CollectorServiceException if value is null or empty
     */
    private static void assertNotEmpty(final String name, final String value) throws CollectorServiceException {
        if (value != null && !value.isEmpty()) {
            return;
        }
        throw new CollectorServiceException("Param '" + name + "' is null or empty");
    }

    public SftpIteratorFactory getSftpIteratorFactory() {
        return this.sftpIteratorFactory;
    }

    public void setSftpIteratorFactory(SftpIteratorFactory sftpIteratorFactory) {
        this.sftpIteratorFactory = sftpIteratorFactory;
    }
}

View File

@ -0,0 +1,206 @@
package eu.dnetlib.data.collector.plugins.sftp;
import java.io.OutputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;
import com.jcraft.jsch.*;
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
/**
 * Iterator over the content of the files stored on a remote SFTP server.
 * <p>
 * The constructor connects to the server, changes to the base path taken from the URL and lists
 * the matching files (optionally recursing into sub-directories, optionally keeping only files
 * modified after a given date). Every call to {@link #next()} downloads one file and returns its
 * content as a String. The connection is closed by {@link #hasNext()} once no file is left.
 * <p>
 * Created by andrea on 11/01/16.
 */
public class SftpIterator implements Iterator<String> {

    private static final Log log = LogFactory.getLog(SftpIterator.class);

    /** Maximum number of download attempts for a single file. */
    private static final int MAX_RETRIES = 5;
    /** Currently not used by this class. */
    private static final int DEFAULT_TIMEOUT = 30000;
    /** Pause between two download attempts of the same file. */
    private static final long BACKOFF_MILLIS = 10000;

    private String baseUrl;
    private String sftpURIScheme;
    private String sftpServerAddress;
    /** Port taken from the base URL, -1 when the URL does not specify one. */
    private int sftpServerPort = -1;
    private String remoteSftpBasePath;
    private String username;
    private String password;
    private boolean isRecursive;
    private Set<String> extensionsSet;
    private boolean incremental;

    private Session sftpSession;
    private ChannelSftp sftpChannel;

    /** Paths (relative to the remote base directory) of the files still to download. */
    private Queue<String> queue;

    private DateTime fromDate = null;
    private DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");

    /**
     * Connects to the server and builds the list of files to collect.
     *
     * @param baseUrl       URL of the remote directory; scheme, host, port and path are taken from it
     * @param username      login name
     * @param password      login password
     * @param isRecursive   whether sub-directories are visited
     * @param extensionsSet file name suffixes to accept
     * @param fromDate      if not blank (format yyyy-MM-dd) only files modified after this date are collected
     * @throws CollectorServiceRuntimeException if the URL is malformed, the connection fails or the
     *                                          remote directory cannot be listed
     */
    public SftpIterator(String baseUrl, String username, String password, boolean isRecursive, Set<String> extensionsSet, String fromDate) {
        this.baseUrl = baseUrl;
        this.username = username;
        this.password = password;
        this.isRecursive = isRecursive;
        this.extensionsSet = extensionsSet;
        this.incremental = StringUtils.isNotBlank(fromDate);
        if (incremental) {
            //I expect fromDate in the format 'yyyy-MM-dd'. See class eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode .
            this.fromDate = DateTime.parse(fromDate, simpleDateTimeFormatter);
            log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString());
        }
        try {
            URI sftpServer = new URI(baseUrl);
            this.sftpURIScheme = sftpServer.getScheme();
            this.sftpServerAddress = sftpServer.getHost();
            this.sftpServerPort = sftpServer.getPort();
            this.remoteSftpBasePath = sftpServer.getPath();
        } catch (URISyntaxException e) {
            throw new CollectorServiceRuntimeException("Bad syntax in the URL " + baseUrl, e);
        }

        connectToSftpServer();
        try {
            initializeQueue();
        } catch (RuntimeException e) {
            // Do not leave the session open when the listing fails: nobody would ever close it.
            disconnectFromSftpServer();
            throw e;
        }
    }

    /**
     * Opens the session and the channel, then changes to the remote base path.
     * On failure everything opened so far is closed before the exception is thrown.
     */
    private void connectToSftpServer() {
        JSch jsch = new JSch();

        try {
            // NOTE(review): this disables host key verification for every JSch session of the JVM.
            JSch.setConfig("StrictHostKeyChecking", "no");
            // Use the port of the URL when there is one, the JSch default otherwise.
            sftpSession = sftpServerPort > 0
                    ? jsch.getSession(username, sftpServerAddress, sftpServerPort)
                    : jsch.getSession(username, sftpServerAddress);
            sftpSession.setPassword(password);
            sftpSession.connect();

            // The URI scheme is used as JSch channel type.
            Channel channel = sftpSession.openChannel(sftpURIScheme);
            channel.connect();
            sftpChannel = (ChannelSftp) channel;
            String pwd = sftpChannel.pwd();
            log.debug("PWD from server: " + pwd);
            String fullPath = pwd + remoteSftpBasePath;
            sftpChannel.cd(fullPath);
            log.debug("PWD from server 2 after 'cd " + fullPath + "' : " + sftpChannel.pwd());
            log.info("Connected to SFTP server " + sftpServerAddress);
        } catch (JSchException e) {
            disconnectFromSftpServer();
            throw new CollectorServiceRuntimeException("Unable to connect to remote SFTP server.", e);
        } catch (SftpException e) {
            disconnectFromSftpServer();
            throw new CollectorServiceRuntimeException("Unable to access the base remote path on the SFTP server.", e);
        }
    }

    /**
     * Closes channel and session, if they have been created.
     */
    private void disconnectFromSftpServer() {
        if (sftpChannel != null) {
            sftpChannel.exit();
        }
        if (sftpSession != null) {
            sftpSession.disconnect();
        }
    }

    /**
     * Fills the queue with the files to collect, starting from the current remote directory.
     */
    private void initializeQueue() {
        queue = new LinkedList<String>();
        log.info(String.format("SFTP collector plugin collecting from %s with recursion = %s, incremental = %s with fromDate=%s", remoteSftpBasePath,
                isRecursive,
                incremental, fromDate));
        listDirectoryRecursive(".", "");
    }

    /**
     * Lists one remote directory and queues the files to collect.
     *
     * @param parentDir  path of the parent directory, relative to the remote base directory
     * @param currentDir name of the directory to list inside parentDir, blank to list parentDir itself
     */
    private void listDirectoryRecursive(final String parentDir, final String currentDir) {
        String dirToList = parentDir;
        if (StringUtils.isNotBlank(currentDir)) {
            dirToList += "/" + currentDir;
        }
        log.debug("PARENT DIR: " + parentDir);
        log.debug("DIR TO LIST: " + dirToList);
        try {
            // Some JSch versions declare ls() with a raw Vector.
            @SuppressWarnings("unchecked")
            Vector<ChannelSftp.LsEntry> ls = sftpChannel.ls(dirToList);
            for (ChannelSftp.LsEntry entry : ls) {
                String currentFileName = entry.getFilename();
                if (currentFileName.equals(".") || currentFileName.equals("..")) {
                    // skip parent directory and directory itself
                    continue;
                }

                SftpATTRS attrs = entry.getAttrs();
                if (attrs.isDir()) {
                    if (isRecursive) {
                        listDirectoryRecursive(dirToList, currentFileName);
                    }
                } else if (hasAcceptedExtension(currentFileName) && isToCollect(currentFileName, attrs)) {
                    // Queue the path relative to the base directory: the bare file name would not
                    // be found by next() for files living in a sub-directory.
                    queue.add(".".equals(dirToList) ? currentFileName : dirToList + "/" + currentFileName);
                }
            }
        } catch (SftpException e) {
            throw new CollectorServiceRuntimeException("Cannot list the sftp remote directory", e);
        }
    }

    /**
     * Tells whether the file name ends with one of the configured extensions.
     * Checked once per file, so a file matching several extensions is queued only once.
     */
    private boolean hasAcceptedExtension(final String fileName) {
        for (String ext : extensionsSet) {
            if (fileName.endsWith(ext)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether a file must be collected: always in full mode, only if modified after
     * fromDate in incremental mode.
     */
    private boolean isToCollect(final String fileName, final SftpATTRS attrs) {
        if (!incremental) {
            return true;
        }
        int mTime = attrs.getMTime();
        //int times are values reduced by the milliseconds, hence we multiply per 1000L
        DateTime dt = new DateTime(mTime * 1000L);
        if (dt.isAfter(fromDate)) {
            log.debug(fileName + " has changed and must be re-collected");
            return true;
        }
        if (log.isDebugEnabled()) {
            log.debug(fileName + " has not changed since last collection");
        }
        return false;
    }

    /**
     * Returns true while there are files to download; closes the connection when none is left.
     */
    @Override
    public boolean hasNext() {
        if (queue.isEmpty()) {
            disconnectFromSftpServer();
            return false;
        } else {
            return true;
        }
    }

    /**
     * Downloads the next file, trying at most MAX_RETRIES times with a pause between attempts.
     *
     * @return the content of the file
     * @throws NoSuchElementException           if no file is left
     * @throws CollectorServiceRuntimeException if every attempt fails or the thread is interrupted while waiting
     */
    @Override
    public String next() {
        String nextRemotePath = queue.remove();
        int nRepeat = 0;
        String fullPathFile = nextRemotePath;
        SftpException lastError = null;
        while (nRepeat < MAX_RETRIES) {
            try {
                // A fresh buffer per attempt, so a partial download is never returned.
                OutputStream baos = new ByteArrayOutputStream();
                sftpChannel.get(nextRemotePath, baos);
                if (log.isDebugEnabled()) {
                    fullPathFile = sftpChannel.pwd() + "/" + nextRemotePath;
                    log.debug(String.format("Collected file from SFTP: %s%s", sftpServerAddress, fullPathFile));
                }
                // NOTE(review): decodes the bytes with the platform default charset - confirm this is intended.
                return baos.toString();
            } catch (SftpException e) {
                lastError = e;
                nRepeat++;
                log.warn(String.format("An error occurred [%s] for %s%s, retrying.. [retried %s time(s)]", e.getMessage(), sftpServerAddress, fullPathFile,
                        nRepeat));
                if (nRepeat < MAX_RETRIES) {
                    // Wait only when another attempt follows.
                    try {
                        Thread.sleep(BACKOFF_MILLIS);
                    } catch (InterruptedException e1) {
                        // Restore the flag and give up instead of silently retrying.
                        Thread.currentThread().interrupt();
                        throw new CollectorServiceRuntimeException(
                                String.format("Interrupted while waiting to retry the download of %s", fullPathFile), e1);
                    }
                }
            }
        }
        throw new CollectorServiceRuntimeException(
                String.format("Impossible to retrieve FTP file %s after %s retries. Aborting FTP collection.", fullPathFile, nRepeat), lastError);
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}

View File

@ -0,0 +1,18 @@
package eu.dnetlib.data.collector.plugins.sftp;
import java.util.Iterator;
import java.util.Set;
/**
 * Factory of {@link SftpIterator} instances.
 * <p>
 * Created by andrea on 11/01/16.
 */
public class SftpIteratorFactory {

    /**
     * Builds a new {@link SftpIterator}, forwarding every argument unchanged to its constructor.
     *
     * @return the newly created iterator
     */
    public Iterator<String> newIterator(final String baseUrl,
            final String username,
            final String password,
            final boolean isRecursive,
            final Set<String> extensionsSet, final String fromDate) {
        final Iterator<String> sftpIterator = new SftpIterator(baseUrl, username, password, isRecursive, extensionsSet, fromDate);
        return sftpIterator;
    }
}

Some files were not shown because too many files have changed in this diff Show More