imported dnet-modular-collector-service-rmi in dnet-core-components, imported dnet-modular-collector-service (and plugins) in dnet-data-services
This commit is contained in:
parent
1c192fbfee
commit
7acac5986a
|
@ -0,0 +1,12 @@
|
|||
package eu.dnetlib.data.collector.functions;

import java.util.List;

import java.util.Map;

import eu.dnetlib.data.collector.rmi.ProtocolParameterValue;

/**
 * Function used to dynamically compute the valid values of a protocol parameter
 * (e.g. listing the OAI sets offered by a repository, see ListOaiSetsFunction).
 */
public interface ParamValuesFunction {

	/**
	 * Finds the valid values for a parameter.
	 *
	 * @param baseUrl the base URL of the remote datasource
	 * @param params  the other protocol parameters already configured on the interface
	 * @return the list of admissible parameter values
	 */
	List<ProtocolParameterValue> findValues(String baseUrl, Map<String, String> params);

}
|
|
@ -0,0 +1,43 @@
|
|||
package eu.dnetlib.data.collector.plugin;

import java.util.List;

import org.springframework.beans.factory.annotation.Required;

import com.google.common.base.Function;
import com.google.common.collect.Lists;

import eu.dnetlib.data.collector.plugin.CollectorPlugin;
import eu.dnetlib.data.collector.rmi.ProtocolDescriptor;
import eu.dnetlib.data.collector.rmi.ProtocolParameter;

/**
 * Convenience base class for collector plugins: holds the injected protocol
 * descriptor and derives the protocol name and parameter names from it.
 */
public abstract class AbstractCollectorPlugin implements CollectorPlugin {

	/** Descriptor of the protocol implemented by this plugin (injected via Spring). */
	private ProtocolDescriptor protocolDescriptor;

	/** Returns the protocol name declared in the descriptor. */
	@Override
	public final String getProtocol() {
		return getProtocolDescriptor().getName();
	}

	/** Returns the names of all parameters declared in the protocol descriptor. */
	@Override
	public final List<String> listNameParameters() {
		final List<String> names = Lists.newArrayList();
		for (final ProtocolParameter param : getProtocolDescriptor().getParams()) {
			names.add(param.getName());
		}
		return names;
	}

	@Override
	public final ProtocolDescriptor getProtocolDescriptor() {
		return protocolDescriptor;
	}

	@Required
	public void setProtocolDescriptor(final ProtocolDescriptor protocolDescriptor) {
		this.protocolDescriptor = protocolDescriptor;
	}
}
|
|
@ -0,0 +1,18 @@
|
|||
package eu.dnetlib.data.collector.plugin;

import java.util.List;

import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import eu.dnetlib.data.collector.rmi.ProtocolDescriptor;

/**
 * Contract of a collector plugin: collects records from a datasource interface
 * using a specific access protocol.
 */
public interface CollectorPlugin {

	/**
	 * Collects the records exposed by the given datasource interface.
	 *
	 * @param interfaceDescriptor the datasource interface to collect from
	 * @param fromDate lower date bound (may be null for an unbounded collection)
	 * @param untilDate upper date bound (may be null for an unbounded collection)
	 * @return the collected records
	 * @throws CollectorServiceException when the collection fails
	 */
	Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String fromDate, String untilDate) throws CollectorServiceException;

	/** Returns the descriptor of the protocol implemented by this plugin. */
	ProtocolDescriptor getProtocolDescriptor();

	/** Returns the protocol name (the name declared in the protocol descriptor). */
	String getProtocol();

	/** Returns the names of the parameters accepted by the protocol. */
	List<String> listNameParameters();
}
|
|
@ -0,0 +1,19 @@
|
|||
package eu.dnetlib.data.collector.plugin;

import java.util.LinkedList;

/**
 * List of error messages accumulated across the retries of a collection;
 * {@link #toString()} renders them as a single readable log line.
 */
public class CollectorPluginErrorLogList extends LinkedList<String> {

	private static final long serialVersionUID = -6925786561303289704L;

	/**
	 * Formats every stored message as {@code "Retry #N: message / "}.
	 *
	 * @return the concatenated log (empty string when the list is empty)
	 */
	@Override
	public String toString() {
		// StringBuilder avoids the O(n^2) cost of repeated String concatenation
		// (the original used `log += ...` plus the `new String()` antipattern).
		final StringBuilder log = new StringBuilder();
		int index = 0;
		for (final String errorMessage : this) {
			log.append(String.format("Retry #%s: %s / ", index++, errorMessage));
		}
		return log.toString();
	}

}
|
|
@ -0,0 +1,30 @@
|
|||
package eu.dnetlib.data.collector.rmi;

import java.util.List;
import java.util.Map;

import javax.jws.WebParam;
import javax.jws.WebService;
import javax.xml.ws.wsaddressing.W3CEndpointReference;

import eu.dnetlib.common.rmi.BaseService;

/**
 * SOAP service exposing the collection of records from remote datasources.
 */
@WebService(targetNamespace = "http://services.dnetlib.eu/")
public interface CollectorService extends BaseService {

	/**
	 * Collects all the records exposed by the given datasource interface.
	 *
	 * @param interfaceDescriptor the datasource interface to collect from
	 * @return an EPR of the resultset delivering the collected records
	 * @throws CollectorServiceException when the collection fails
	 */
	W3CEndpointReference collect(@WebParam(name = "interface") final InterfaceDescriptor interfaceDescriptor) throws CollectorServiceException;

	/**
	 * Collects the records in the given date range (bounds may be null for an
	 * unbounded collection).
	 */
	W3CEndpointReference dateRangeCollect(
			@WebParam(name = "interface") final InterfaceDescriptor interfaceDescriptor,
			@WebParam(name = "from") final String from,
			@WebParam(name = "until") final String until) throws CollectorServiceException;

	/** Lists the protocols supported by the registered collector plugins. */
	List<ProtocolDescriptor> listProtocols();

	/**
	 * Lists the valid values of a protocol parameter, computed by the parameter's
	 * populate function (empty list when the parameter declares none).
	 */
	List<ProtocolParameterValue> listValidValuesForParam(
			@WebParam(name = "protocol") String protocol,
			@WebParam(name = "baseUrl") String baseUrl,
			@WebParam(name = "param") String param,
			@WebParam(name = "otherParams") Map<String, String> otherParams) throws CollectorServiceException;

}
|
|
@ -0,0 +1,25 @@
|
|||
package eu.dnetlib.data.collector.rmi;

import eu.dnetlib.common.rmi.RMIException;

/**
 * Checked exception thrown by the collector service and its plugins.
 */
public class CollectorServiceException extends RMIException {

	/** Serialization id. */
	private static final long serialVersionUID = 7523999812098059764L;

	public CollectorServiceException(String string) {
		super(string);
	}


	public CollectorServiceException(String string, Throwable exception) {
		super(string, exception);
	}

	public CollectorServiceException(Throwable exception) {
		super(exception);
	}

}
|
|
@ -0,0 +1,22 @@
|
|||
package eu.dnetlib.data.collector.rmi;

/**
 * Unchecked counterpart of {@code CollectorServiceException}, for failures
 * raised in contexts that cannot declare a checked exception.
 */
public class CollectorServiceRuntimeException extends RuntimeException {

	/** Serialization id. */
	private static final long serialVersionUID = 6317717870955037359L;

	public CollectorServiceRuntimeException(final String string) {
		super(string);
	}

	public CollectorServiceRuntimeException(final String string, final Throwable exception) {
		super(string, exception);
	}

	public CollectorServiceRuntimeException(final Throwable exception) {
		super(exception);
	}

}
|
|
@ -0,0 +1,70 @@
|
|||
package eu.dnetlib.data.collector.rmi;

import java.util.HashMap;

import javax.xml.bind.annotation.XmlRootElement;

import org.dom4j.Node;
import org.springframework.beans.factory.annotation.Required;

import com.google.common.collect.Maps;

/**
 * Bean describing a datasource interface: identifier, base URL, access protocol
 * name and the protocol-specific parameters.
 */
@XmlRootElement
public class InterfaceDescriptor {

	// interface identifier
	private String id;

	// base URL of the remote datasource
	private String baseUrl;

	// name of the access protocol
	private String protocol;

	// protocol-specific parameters (name -> value)
	// NOTE(review): concrete HashMap presumably required by JAXB binding — confirm
	private HashMap<String, String> params = Maps.newHashMap();

	public String getBaseUrl() {
		return baseUrl;
	}

	public void setBaseUrl(final String baseUrl) {
		this.baseUrl = baseUrl;
	}

	public String getId() {
		return id;
	}

	@Required
	public void setId(final String id) {
		this.id = id;
	}

	public HashMap<String, String> getParams() {
		return params;
	}

	public void setParams(final HashMap<String, String> params) {
		this.params = params;
	}

	public String getProtocol() {
		return protocol;
	}

	public void setProtocol(final String protocol) {
		this.protocol = protocol;
	}

	/**
	 * Builds a descriptor from an XML profile fragment: reads the {@code id}
	 * attribute, the {@code BASE_URL} element, the {@code ACCESS_PROTOCOL}
	 * element and every attribute of {@code ACCESS_PROTOCOL} as a parameter.
	 *
	 * @param node the dom4j node of the interface profile fragment
	 * @return the populated descriptor
	 */
	public static InterfaceDescriptor newInstance(final Node node) {
		final InterfaceDescriptor ifc = new InterfaceDescriptor();
		ifc.setId(node.valueOf("./@id"));
		ifc.setBaseUrl(node.valueOf("./BASE_URL"));
		ifc.setProtocol(node.valueOf("./ACCESS_PROTOCOL"));

		// each attribute of ACCESS_PROTOCOL becomes a protocol parameter
		for (Object o : node.selectNodes("./ACCESS_PROTOCOL/@*")) {
			final Node n = (Node) o;
			ifc.getParams().put(n.getName(), n.getText());
		}

		return ifc;
	}

}
|
|
@ -0,0 +1,39 @@
|
|||
package eu.dnetlib.data.collector.rmi;

import java.util.ArrayList;
import java.util.List;

import javax.xml.bind.annotation.XmlRootElement;

import org.springframework.beans.factory.annotation.Required;

/**
 * Bean describing an access protocol: its name and the list of its parameters.
 */
@XmlRootElement
public class ProtocolDescriptor {

	// protocol name (e.g. the value matched case-insensitively by the plugin enumerator)
	private String name;
	// parameters accepted by the protocol
	private List<ProtocolParameter> params = new ArrayList<ProtocolParameter>();

	public ProtocolDescriptor() {}

	public ProtocolDescriptor(final String name, final List<ProtocolParameter> params) {
		this.name = name;
		this.params = params;
	}

	public String getName() {
		return name;
	}

	@Required
	public void setName(final String name) {
		this.name = name;
	}

	public List<ProtocolParameter> getParams() {
		return params;
	}

	public void setParams(final List<ProtocolParameter> params) {
		this.params = params;
	}
}
|
|
@ -0,0 +1,87 @@
|
|||
package eu.dnetlib.data.collector.rmi;

import javax.xml.bind.annotation.XmlRootElement;
import javax.xml.bind.annotation.XmlTransient;

import org.springframework.beans.factory.annotation.Required;

import eu.dnetlib.data.collector.functions.ParamValuesFunction;

/**
 * Bean describing a single protocol parameter: name, optionality, type,
 * an optional validation regex and an optional function that computes the
 * admissible values at runtime.
 */
@XmlRootElement
public class ProtocolParameter {

	// parameter name
	private String name;
	// true when the parameter may be omitted
	private boolean optional = false;
	// parameter type, defaults to free text
	private ProtocolParameterType type = ProtocolParameterType.TEXT;
	// optional validation regex (null when absent)
	private String regex = null;
	// function computing the valid values; transient because it is not serializable
	private transient ParamValuesFunction populateFunction = null;
	// serialized flag mirroring (populateFunction != null), kept in sync by the
	// constructor and by setPopulateFunction so remote clients can test it
	private boolean functionPopulated = false;

	public ProtocolParameter() {}

	public ProtocolParameter(final String name, final boolean optional, final ProtocolParameterType type, final String regex) {
		this(name, optional, type, regex, null);
	}

	public ProtocolParameter(final String name, final boolean optional, final ProtocolParameterType type, final String regex,
			final ParamValuesFunction populateFunction) {
		this.name = name;
		this.optional = optional;
		this.type = type;
		this.regex = regex;
		this.populateFunction = populateFunction;
		this.functionPopulated = this.populateFunction != null;
	}

	public String getName() {
		return name;
	}

	@Required
	public void setName(final String name) {
		this.name = name;
	}

	public boolean isOptional() {
		return optional;
	}

	public void setOptional(final boolean optional) {
		this.optional = optional;
	}

	public ProtocolParameterType getType() {
		return type;
	}

	public void setType(final ProtocolParameterType type) {
		this.type = type;
	}

	public String getRegex() {
		return regex;
	}

	public void setRegex(final String regex) {
		this.regex = regex;
	}

	// excluded from the XML representation: the function only exists server side
	@XmlTransient
	public ParamValuesFunction getPopulateFunction() {
		return populateFunction;
	}

	public void setPopulateFunction(final ParamValuesFunction populateFunction) {
		this.populateFunction = populateFunction;
		this.functionPopulated = this.populateFunction != null;
	}

	public boolean isFunctionPopulated() {
		return functionPopulated;
	}

	public void setFunctionPopulated(final boolean functionPopulated) {
		this.functionPopulated = functionPopulated;
	}

}
|
|
@ -0,0 +1,8 @@
|
|||
package eu.dnetlib.data.collector.rmi;

import javax.xml.bind.annotation.XmlEnum;

/** Admissible types of a protocol parameter. */
@XmlEnum
public enum ProtocolParameterType {
	TEXT, NUMBER, LIST, BOOLEAN
}
|
|
@ -0,0 +1,34 @@
|
|||
package eu.dnetlib.data.collector.rmi;
|
||||
|
||||
import javax.xml.bind.annotation.XmlRootElement;
|
||||
|
||||
@XmlRootElement
|
||||
public class ProtocolParameterValue {
|
||||
|
||||
private String id;
|
||||
private String name;
|
||||
|
||||
public ProtocolParameterValue() {}
|
||||
|
||||
public ProtocolParameterValue(final String id, final String name) {
|
||||
this.id = id;
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
}
|
|
@ -23,11 +23,42 @@
|
|||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.json</groupId>
|
||||
<artifactId>json</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.ximpleware</groupId>
|
||||
<artifactId>vtd-xml</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jcraft</groupId>
|
||||
<artifactId>jsch</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-compress</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-net</groupId>
|
||||
<artifactId>commons-net</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-csv</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi-ooxml</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.jsoup</groupId>
|
||||
<artifactId>jsoup</artifactId>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
package eu.dnetlib.data.collector;

import java.util.Collection;

import org.springframework.beans.BeansException;
import org.springframework.beans.factory.BeanFactory;
import org.springframework.beans.factory.BeanFactoryAware;
import org.springframework.beans.factory.ListableBeanFactory;

import eu.dnetlib.data.collector.plugin.CollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;

/**
 * Enumerates the CollectorPlugin beans registered in the Spring context and
 * resolves the plugin responsible for a given protocol.
 */
public class CollectorPluginEnumerator implements BeanFactoryAware {

	// private static final Log log = LogFactory.getLog(CollectorPluginEnumerator.class); // NOPMD by marko on 11/24/08 5:02 PM

	/**
	 * bean factory.
	 */
	private ListableBeanFactory beanFactory;

	/**
	 * Get all beans implementing the CollectorPlugin interface.
	 *
	 * @return the set of eu.dnetlib.data.collector.plugin.CollectorPlugin(s)
	 */
	public Collection<CollectorPlugin> getAll() {
		return beanFactory.getBeansOfType(CollectorPlugin.class).values();
	}

	@Override
	public void setBeanFactory(final BeanFactory beanFactory) throws BeansException {
		// cast is safe in practice: application contexts are ListableBeanFactory
		this.beanFactory = (ListableBeanFactory) beanFactory;
	}

	public ListableBeanFactory getBeanFactory() {
		return beanFactory;
	}

	/**
	 * Get given CollectorPlugin or throws exception.
	 *
	 * @param protocol the given protocol (matched case-insensitively)
	 * @return a CollectorPlugin compatible with the given protocol
	 * @throws CollectorServiceException when no suitable plugin is found
	 */
	public CollectorPlugin get(final String protocol) throws CollectorServiceException {
		for (CollectorPlugin cp : getAll()) {
			if (protocol.equalsIgnoreCase(cp.getProtocol())) {
				return cp;
			}
		}
		throw new CollectorServiceException("plugin not found for protocol: " + protocol);
	}
}
|
|
@ -0,0 +1,77 @@
|
|||
package eu.dnetlib.data.collector;

import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.annotation.Resource;
import javax.xml.ws.wsaddressing.W3CEndpointReference;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import eu.dnetlib.data.collector.plugin.CollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorService;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import eu.dnetlib.data.collector.rmi.ProtocolDescriptor;
import eu.dnetlib.data.collector.rmi.ProtocolParameter;
import eu.dnetlib.data.collector.rmi.ProtocolParameterValue;
import eu.dnetlib.enabling.resultset.IterableResultSetFactory;
import eu.dnetlib.enabling.tools.AbstractBaseService;

/**
 * Implementation of the CollectorService: delegates the collection to the
 * plugin registered for the interface protocol and publishes the collected
 * records through a resultset.
 */
public class CollectorServiceImpl extends AbstractBaseService implements CollectorService {

	// resolves the plugin for a given protocol
	@Resource
	private CollectorPluginEnumerator collectorPluginEnumerator;

	// wraps the collected records into a resultset EPR
	@Resource
	private IterableResultSetFactory iterableResultSetFactory;

	/** Collects without date bounds (delegates with null from/until). */
	@Override
	public W3CEndpointReference collect(final InterfaceDescriptor ifDescriptor) throws CollectorServiceException {
		return dateRangeCollect(ifDescriptor, null, null);
	}

	/**
	 * Collects the records in the given date range: validates the interface
	 * parameters against the plugin, then exposes the records as a resultset.
	 */
	@Override
	public W3CEndpointReference dateRangeCollect(
			final InterfaceDescriptor ifDescriptor, final String from, final String until)
			throws CollectorServiceException {
		final CollectorPlugin plugin = collectorPluginEnumerator.get(ifDescriptor.getProtocol());

		// every parameter on the interface must be declared by the plugin
		if (!verifyParams(ifDescriptor.getParams().keySet(), Sets.newHashSet(plugin.listNameParameters()))) { throw new CollectorServiceException(
				"Invalid parameters, valid: " + plugin.listNameParameters() + ", current: " + ifDescriptor.getParams().keySet()); }

		final Iterable<String> iter = plugin.collect(ifDescriptor, from, until);

		return iterableResultSetFactory.createIterableResultSet(iter);
	}

	/** Lists the protocol descriptors of all registered plugins. */
	@Override
	public List<ProtocolDescriptor> listProtocols() {
		final List<ProtocolDescriptor> list = Lists.newArrayList();
		for (CollectorPlugin plugin : collectorPluginEnumerator.getAll()) {
			list.add(plugin.getProtocolDescriptor());
		}
		return list;
	}

	/**
	 * Computes the valid values of the named parameter via its populate
	 * function; returns an empty list when the parameter has no function.
	 */
	@Override
	public List<ProtocolParameterValue> listValidValuesForParam(final String protocol,
			final String baseUrl,
			final String param,
			final Map<String, String> otherParams) throws CollectorServiceException {
		final CollectorPlugin plugin = collectorPluginEnumerator.get(protocol);

		for (ProtocolParameter pp : plugin.getProtocolDescriptor().getParams()) {
			if (pp.getName().equals(param) && pp.isFunctionPopulated()) { return pp.getPopulateFunction().findValues(baseUrl, otherParams); }
		}

		return Lists.newArrayList();
	}

	// true when every current parameter name is among the valid ones
	private boolean verifyParams(final Set<String> curr, final Set<String> valid) {
		return valid.containsAll(curr);
	}

}
|
|
@ -0,0 +1,56 @@
|
|||
package eu.dnetlib.data.collector.functions;

import java.io.StringReader;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.springframework.beans.factory.annotation.Required;

import com.google.common.base.Function;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;

import eu.dnetlib.data.collector.plugins.oaisets.OaiSetsIteratorFactory;
import eu.dnetlib.data.collector.rmi.ProtocolParameterValue;

/**
 * ParamValuesFunction listing the OAI sets of a repository: each XML fragment
 * returned by the sets iterator is mapped to a ProtocolParameterValue with
 * setSpec as id and "setSpec - name: ..." as label (or just setSpec when the
 * set name is blank or equal to the spec).
 */
public class ListOaiSetsFunction implements ParamValuesFunction {

	private OaiSetsIteratorFactory oaiSetsIteratorFactory;

	@Override
	public List<ProtocolParameterValue> findValues(final String baseUrl, final Map<String, String> params) {
		// NOTE(review): this single SAXReader instance is shared by every apply()
		// call of the transforming function — confirm the iteration stays single-threaded.
		final SAXReader reader = new SAXReader();

		final Iterator<ProtocolParameterValue> iter = Iterators.transform(oaiSetsIteratorFactory.newIterator(baseUrl),
				new Function<String, ProtocolParameterValue>() {

					@Override
					public ProtocolParameterValue apply(final String s) {
						try {
							final Document doc = reader.read(new StringReader(s));
							final String id = doc.valueOf("//*[local-name()='setSpec']");
							final String name = doc.valueOf("//*[local-name()='setName']");
							// label falls back to the bare setSpec when no distinct name exists
							return new ProtocolParameterValue(id,
									(StringUtils.isBlank(name) || name.equalsIgnoreCase(id)) ? id : id + " - name: \"" + name + "\"");
						} catch (final DocumentException e) {
							throw new RuntimeException("Error in ListSets", e);
						}
					}
				});
		// materialize eagerly so parsing errors surface here, not at the caller
		return Lists.newArrayList(iter);
	}

	public OaiSetsIteratorFactory getOaiSetsIteratorFactory() {
		return oaiSetsIteratorFactory;
	}

	@Required
	public void setOaiSetsIteratorFactory(final OaiSetsIteratorFactory oaiSetsIteratorFactory) {
		this.oaiSetsIteratorFactory = oaiSetsIteratorFactory;
	}
}
|
|
@ -0,0 +1,38 @@
|
|||
package eu.dnetlib.data.collector.plugins;

import java.io.BufferedInputStream;
import java.util.Iterator;

import org.apache.commons.lang3.StringUtils;

import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
import eu.dnetlib.miscutils.iterators.xml.XMLIterator;

/**
 * Base class for collector plugins that split a single XML stream into records
 * on a configurable element name (the "splitOnElement" parameter).
 */
public abstract class AbstractSplittedRecordPlugin extends AbstractCollectorPlugin {

	@Override
	public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
			throws CollectorServiceException {

		final String baseUrl = interfaceDescriptor.getBaseUrl();
		final String element = interfaceDescriptor.getParams().get("splitOnElement");

		// both the URL and the splitting element are mandatory
		if (StringUtils.isBlank(baseUrl)) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
		if (StringUtils.isBlank(element)) { throw new CollectorServiceException("Param 'splitOnElement' is null or empty"); }

		// NOTE(review): the stream is opened once, so the returned Iterable can be
		// consumed a single time — confirm callers never iterate it twice.
		final BufferedInputStream bis = getBufferedInputStream(baseUrl);

		return () -> new XMLIterator(element, bis);
	}

	/**
	 * Opens the stream that will be split into records.
	 *
	 * @param baseUrl the location of the XML source
	 * @throws CollectorServiceException when the stream cannot be opened
	 */
	abstract protected BufferedInputStream getBufferedInputStream(final String baseUrl) throws CollectorServiceException;

}
|
|
@ -0,0 +1,19 @@
|
|||
package eu.dnetlib.data.collector.plugins;

import java.io.BufferedInputStream;
import java.net.URL;

import eu.dnetlib.data.collector.rmi.CollectorServiceException;

/**
 * Collector plugin reading the XML source from a classpath resource;
 * the path component of the base URL is used as the resource name.
 */
public class ClasspathCollectorPlugin extends AbstractSplittedRecordPlugin {

	@Override
	protected BufferedInputStream getBufferedInputStream(final String baseUrl) throws CollectorServiceException {
		try {
			// getResourceAsStream() returns null for a missing resource; the resulting
			// NPE is caught below and wrapped with the offending URL
			return new BufferedInputStream(getClass().getResourceAsStream(new URL(baseUrl).getPath()));
		} catch (Exception e) {
			// BUG FIX: preserve the original exception as the cause (it was dropped before)
			throw new CollectorServiceException("Error downloading url: " + baseUrl, e);
		}
	}

}
|
|
@ -0,0 +1,149 @@
|
|||
package eu.dnetlib.data.collector.plugins;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;

import org.apache.commons.io.input.BOMInputStream;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;

import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;

/**
 * Please use eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin instead
 */
@Deprecated
public class FileCSVCollectorPlugin extends AbstractCollectorPlugin {

	private static final Log log = LogFactory.getLog(FileCSVCollectorPlugin.class);

	// Iterator converting each CSV line into a <csvRecord> XML document.
	class FileCSVIterator implements Iterator<String> {

		// next record (as XML), null once the file is exhausted
		private String next;

		private BufferedReader reader;

		// column separator (already unescaped by the caller)
		private String separator;
		// optional quote character to strip from values (blank = no stripping)
		private String quote;

		public FileCSVIterator(final BufferedReader reader, final String separator, final String quote) {
			this.reader = reader;
			this.separator = separator;
			this.quote = quote;
			// read ahead so hasNext() can be answered immediately
			next = calculateNext();
		}

		@Override
		public boolean hasNext() {
			return next != null;
		}

		@Override
		public String next() {
			final String s = next;
			next = calculateNext();
			return s;
		}

		// Reads one line and converts it to XML; returns null (and closes the reader) at EOF.
		private String calculateNext() {
			try {
				final Document document = DocumentHelper.createDocument();
				final Element root = document.addElement("csvRecord");

				String newLine = reader.readLine();

				// FOR SOME FILES IT RETURN NULL ALSO IF THE FILE IS NOT READY DONE
				if (newLine == null) {
					newLine = reader.readLine();
				}
				if (newLine == null) {
					log.info("there is no line, closing RESULT SET");

					reader.close();
					return null;
				}
				final String[] currentRow = newLine.split(separator);

				if (currentRow != null) {

					for (int i = 0; i < currentRow.length; i++) {
						// column name from the header row when available, otherwise "columnN"
						final String hAttribute = (headers != null) && (i < headers.length) ? headers[i] : "column" + i;

						final Element row = root.addElement("column");
						// mark the column configured as record identifier
						if (i == identifierNumber) {
							row.addAttribute("isID", "true");
						}
						final String value = StringUtils.isBlank(quote) ? currentRow[i] : StringUtils.strip(currentRow[i], quote);

						row.addAttribute("name", hAttribute).addText(value);
					}
					return document.asXML();
				}
			} catch (final IOException e) {
				log.error("Error calculating next csv element", e);
			}
			return null;
		}

		@Override
		public void remove() {
			throw new UnsupportedOperationException();
		}

	}

	// header names parsed from the first row (only when the "header" param is "true")
	private String[] headers = null;
	// index of the column flagged as record identifier (the "identifier" param)
	private int identifierNumber;

	/**
	 * Reads the CSV file referenced by the interface base URL (treated as a
	 * file path) and returns an iterable of XML records, one per row.
	 * Recognized params: header, separator, quote, identifier.
	 */
	@Override
	public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
			throws CollectorServiceException {
		final String header = interfaceDescriptor.getParams().get("header");
		final String separator = StringEscapeUtils.unescapeJava(interfaceDescriptor.getParams().get("separator"));
		final String quote = interfaceDescriptor.getParams().get("quote");

		identifierNumber = Integer.parseInt(interfaceDescriptor.getParams().get("identifier"));
		URL u = null;
		try {
			u = new URL(interfaceDescriptor.getBaseUrl());
		} catch (final MalformedURLException e1) {
			throw new CollectorServiceException(e1);
		}
		final String baseUrl = u.getPath();

		log.info("base URL = " + baseUrl);

		try {

			// BOMInputStream transparently skips a possible UTF byte-order mark
			final BufferedReader br = new BufferedReader(new InputStreamReader(new BOMInputStream(new FileInputStream(baseUrl))));

			if ((header != null) && "true".equals(header.toLowerCase())) {
				final String[] tmpHeader = br.readLine().split(separator);
				if (StringUtils.isNotBlank(quote)) {
					// strip the quote character from each header name
					int i = 0;
					headers = new String[tmpHeader.length];
					for (final String h : tmpHeader) {
						headers[i] = StringUtils.strip(h, quote);
						i++;
					}
				} else headers = tmpHeader;
			}
			return () -> new FileCSVIterator(br, separator, quote);
		} catch (final Exception e) {
			throw new CollectorServiceException(e);
		}
	}

}
|
|
@ -0,0 +1,20 @@
|
|||
package eu.dnetlib.data.collector.plugins;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.net.URL;

import eu.dnetlib.data.collector.rmi.CollectorServiceException;

/**
 * Collector plugin reading the XML source from a local file;
 * the path component of the base URL is used as the file path.
 */
public class FileCollectorPlugin extends AbstractSplittedRecordPlugin {

	@Override
	protected BufferedInputStream getBufferedInputStream(final String baseUrl) throws CollectorServiceException {
		try {
			final String path = new URL(baseUrl).getPath();
			return new BufferedInputStream(new FileInputStream(path));
		} catch (Exception e) {
			throw new CollectorServiceException("Error reading file " + baseUrl, e);
		}
	}

}
|
|
@ -0,0 +1,23 @@
|
|||
package eu.dnetlib.data.collector.plugins;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.net.URL;
import java.util.zip.GZIPInputStream;

import eu.dnetlib.data.collector.rmi.CollectorServiceException;

/**
 * Collector plugin reading the XML source from a local gzip-compressed file;
 * the path component of the base URL is used as the file path.
 */
public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin {

	@Override
	protected BufferedInputStream getBufferedInputStream(final String baseUrl) throws CollectorServiceException {

		try {
			GZIPInputStream stream = new GZIPInputStream(new FileInputStream(new URL(baseUrl).getPath()));
			return new BufferedInputStream(stream);
		} catch (Exception e) {
			// include the URL in the message for consistency with FileCollectorPlugin
			// (the original wrapped the cause but gave no context)
			throw new CollectorServiceException("Error reading gzip file " + baseUrl, e);
		}
	}

}
|
|
@ -0,0 +1,170 @@
|
|||
package eu.dnetlib.data.collector.plugins;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
import com.google.common.collect.Iterators;
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
import org.apache.commons.csv.CSVFormat;
|
||||
import org.apache.commons.csv.CSVParser;
|
||||
import org.apache.commons.io.input.BOMInputStream;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.dom4j.Element;
|
||||
|
||||
/**
|
||||
* The Class HttpCSVCollectorPlugin.
|
||||
*/
|
||||
/**
 * Collector plugin that downloads a CSV file over HTTP and exposes each CSV
 * record as a small XML document: a {@code <csvRecord>} root with one
 * {@code <column>} element per CSV header.
 */
public class HttpCSVCollectorPlugin extends AbstractCollectorPlugin {

	private static final Log log = LogFactory.getLog(HttpCSVCollectorPlugin.class);

	// UTF-8 byte-order mark; the download path strips it via BOMInputStream.
	public static final String UTF8_BOM = "\uFEFF";

	/**
	 * Iterable over the records of the remote CSV file referenced by the
	 * interface descriptor's baseUrl.
	 */
	class HTTPCSVIterator implements Iterable<String> {

		/** Carries baseUrl plus the CSV params: "separator", "identifier", "quote". */
		private InterfaceDescriptor descriptor;

		/**
		 * Instantiates a new HTTPCSV iterator.
		 *
		 * @param descriptor
		 *            the descriptor
		 */
		public HTTPCSVIterator(final InterfaceDescriptor descriptor) {
			this.descriptor = descriptor;
		}

		/**
		 * Downloads the CSV into a temp file (skipping lines with unbalanced
		 * double quotes), then lazily maps each parsed record to an XML string.
		 *
		 * NOTE(review): on any failure this returns null instead of throwing,
		 * which will NPE in a for-each loop — TODO confirm callers tolerate a
		 * null iterator.
		 *
		 * @return the iterator, or null on error
		 */
		@SuppressWarnings("resource")
		@Override
		public Iterator<String> iterator() {

			try {
				final String separator = descriptor.getParams().get("separator");
				final String identifier = descriptor.getParams().get("identifier");
				final String quote = descriptor.getParams().get("quote");
				final URL url = new URL(descriptor.getBaseUrl());
				long nLines = 0;

				// FIX
				// This code should skip the lines with invalid quotes
				final File tempFile = File.createTempFile("csv-", ".tmp");
				try (InputStream is = url.openConnection().getInputStream();
						BOMInputStream bomIs = new BOMInputStream(is);
						BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs));
						FileWriter fw = new FileWriter(tempFile)) {

					String line;
					while ((line = reader.readLine()) != null) {
						// Quote validation is applied only when the configured quote char is '"'.
						if (StringUtils.isBlank(quote) || (quote.charAt(0) != '"') || verifyQuotes(line, separator.charAt(0))) {
							fw.write(line);
							fw.write("\n");
							nLines++;
						}
					}
				}
				// END FIX

				// "\\t" (escaped tab) or a blank separator both mean a real tab delimiter.
				final CSVFormat format = CSVFormat.EXCEL
						.withHeader()
						.withDelimiter(separator.equals("\\t") || StringUtils.isBlank(separator) ? '\t' : separator.charAt(0))
						.withQuote(StringUtils.isBlank(quote) ? null : quote.charAt(0))
						.withTrim();

				final CSVParser parser = new CSVParser(new FileReader(tempFile), format);
				final Set<String> headers = parser.getHeaderMap().keySet();

				// First kept line is the header row, so records = lines - 1.
				final long nRecords = nLines - 1;

				return Iterators.transform(parser.iterator(), input -> {
					try {
						final Document document = DocumentHelper.createDocument();
						final Element root = document.addElement("csvRecord");
						for (final String key : headers) {
							final Element row = root.addElement("column");
							row.addAttribute("name", key).addText(XmlCleaner.cleanAllEntities(input.get(key)));
							// Mark the configured identifier column so downstream code can find the record id.
							if (key.equals(identifier)) {
								row.addAttribute("isID", "true");
							}
						}

						return document.asXML();
					} finally {
						log.debug(tempFile.getAbsolutePath());
						// Delete the temp file after the last record is emitted.
						// NOTE(review): the temp file leaks if iteration stops early — TODO confirm acceptable.
						if (parser.getRecordNumber() == nRecords) {
							log.debug("DELETING " + tempFile.getAbsolutePath());
							tempFile.delete();
						}
					}
				});
			} catch (final Exception e) {
				log.error("Error iterating csv lines", e);
				return null;
			}
		}

	}

	/*
	 * (non-Javadoc)
	 *
	 * @see eu.dnetlib.data.collector.plugin.CollectorPlugin#collect(eu.dnetlib.data.collector.rmi.InterfaceDescriptor, java.lang.String,
	 * java.lang.String)
	 */
	@Override
	public Iterable<String> collect(final InterfaceDescriptor descriptor, final String fromDate, final String untilDate) throws CollectorServiceException {

		return new HTTPCSVIterator(descriptor);
	}

	/**
	 * Checks that every double quote in the line opens a quoted field right
	 * after a separator (or at line start) and closes it right before a
	 * separator (or at line end), treating "" inside a field as an escaped
	 * quote. Lines that fail the check are logged and rejected.
	 *
	 * @param line the raw CSV line
	 * @param separator the field separator character
	 * @return true when the quoting is balanced, false otherwise
	 */
	public boolean verifyQuotes(final String line, final char separator) {
		final char[] cs = line.trim().toCharArray();
		boolean inField = false;
		boolean skipNext = false;
		for (int i = 0; i < cs.length; i++) {
			if (skipNext) {
				skipNext = false;
			} else if (inField) {
				if ((cs[i] == '\"') && ((i == (cs.length - 1)) || (cs[i + 1] == separator))) {
					inField = false;
				} else if ((cs[i] == '\"') && (i < (cs.length - 1))) {
					if ((cs[i + 1] == '\"')) {
						// Escaped quote ("") inside a quoted field: consume both characters.
						skipNext = true;
					} else {
						log.warn("Skipped invalid line: " + line);
						return false;
					}
				}
			} else {
				if ((cs[i] == '\"') && ((i == 0) || (cs[i - 1] == separator))) {
					inField = true;
				}
			}
		}

		// A field left open at end of line is invalid.
		if (inField) {
			log.warn("Skipped invalid line: " + line);
			return false;
		}

		return true;
	}

}
|
|
@ -0,0 +1,39 @@
|
|||
package eu.dnetlib.data.collector.plugins;
|
||||
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.HttpStatus;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
public class HttpCollectorPlugin extends AbstractSplittedRecordPlugin {
|
||||
|
||||
@Override
|
||||
protected BufferedInputStream getBufferedInputStream(final String baseUrl) throws CollectorServiceException {
|
||||
final HttpGet method = new HttpGet(baseUrl);
|
||||
|
||||
try(CloseableHttpResponse response = HttpClients.createDefault().execute(method)) {
|
||||
|
||||
int responseCode = response.getStatusLine().getStatusCode();
|
||||
|
||||
if (HttpStatus.SC_OK != responseCode) {
|
||||
throw new CollectorServiceException("Error " + responseCode + " dowloading url: " + baseUrl);
|
||||
}
|
||||
|
||||
byte[] content = IOUtils.toByteArray(response.getEntity().getContent());
|
||||
|
||||
try(InputStream in = new ByteArrayInputStream(content)) {
|
||||
return new BufferedInputStream(in);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new CollectorServiceException("Error dowloading url: " + baseUrl);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,224 @@
|
|||
package eu.dnetlib.data.collector.plugins;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.*;
|
||||
import java.security.GeneralSecurityException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import javax.net.ssl.HttpsURLConnection;
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.CollectorPluginErrorLogList;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* @author jochen, michele, andrea
|
||||
*/
|
||||
public class HttpConnector {
|
||||
|
||||
private static final Log log = LogFactory.getLog(HttpConnector.class);
|
||||
|
||||
private int maxNumberOfRetry = 6;
|
||||
private int defaultDelay = 120; // seconds
|
||||
private int readTimeOut = 120; // seconds
|
||||
|
||||
private String responseType = null;
|
||||
|
||||
private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
||||
|
||||
public HttpConnector() {
|
||||
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the URL returns the content via HTTP GET
|
||||
*
|
||||
* @param requestUrl the URL
|
||||
* @return the content of the downloaded resource
|
||||
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
||||
*/
|
||||
public String getInputSource(final String requestUrl) throws CollectorServiceException {
|
||||
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the URL returns the content as a stream via HTTP GET
|
||||
*
|
||||
* @param requestUrl the URL
|
||||
* @return the content of the downloaded resource as InputStream
|
||||
* @throws CollectorServiceException when retrying more than maxNumberOfRetry times
|
||||
*/
|
||||
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException {
|
||||
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||
}
|
||||
|
||||
private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
||||
throws CollectorServiceException {
|
||||
try {
|
||||
InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||
try {
|
||||
return IOUtils.toString(s);
|
||||
} catch (IOException e) {
|
||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
||||
Thread.sleep(defaultDelay * 1000);
|
||||
errorList.add(e.getMessage());
|
||||
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
|
||||
}
|
||||
finally{
|
||||
IOUtils.closeQuietly(s);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
throw new CollectorServiceException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private InputStream attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
||||
throws CollectorServiceException {
|
||||
|
||||
if (retryNumber > maxNumberOfRetry) { throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList); }
|
||||
|
||||
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
|
||||
try {
|
||||
InputStream input = null;
|
||||
|
||||
try {
|
||||
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
||||
urlConn.setInstanceFollowRedirects(false);
|
||||
urlConn.setReadTimeout(readTimeOut * 1000);
|
||||
urlConn.addRequestProperty("User-Agent", userAgent);
|
||||
|
||||
if (log.isDebugEnabled()) {
|
||||
logHeaderFields(urlConn);
|
||||
}
|
||||
|
||||
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
||||
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
|
||||
log.warn("waiting and repeating request after " + retryAfter + " sec.");
|
||||
Thread.sleep(retryAfter * 1000);
|
||||
errorList.add("503 Service Unavailable");
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||
} else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM) || (urlConn.getResponseCode()
|
||||
== HttpURLConnection.HTTP_MOVED_TEMP)) {
|
||||
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
||||
log.debug("The requested url has been moved to " + newUrl);
|
||||
errorList.add(String.format("%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(newUrl, retryNumber + 1, errorList);
|
||||
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
|
||||
log.error(String.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
||||
Thread.sleep(defaultDelay * 1000);
|
||||
errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||
} else {
|
||||
input = urlConn.getInputStream();
|
||||
responseType = urlConn.getContentType();
|
||||
return input;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
||||
Thread.sleep(defaultDelay * 1000);
|
||||
errorList.add(e.getMessage());
|
||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
throw new CollectorServiceException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
||||
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
||||
|
||||
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
||||
if (e.getKey() != null) {
|
||||
for (String v : e.getValue()) {
|
||||
log.debug(" key: " + e.getKey() + " - value: " + v);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
||||
for (String key : headerMap.keySet()) {
|
||||
if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) && NumberUtils.isCreatable(headerMap.get(key).get(0))) {
|
||||
return Integer
|
||||
.parseInt(headerMap.get(key).get(0)) + 10;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorServiceException {
|
||||
for (String key : headerMap.keySet()) {
|
||||
if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { return headerMap.get(key).get(0); }
|
||||
}
|
||||
throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING");
|
||||
}
|
||||
|
||||
/**
|
||||
* register for https scheme; this is a workaround and not intended for the use in trusted environments
|
||||
*/
|
||||
public void initTrustManager() {
|
||||
final X509TrustManager tm = new X509TrustManager() {
|
||||
|
||||
@Override
|
||||
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public X509Certificate[] getAcceptedIssuers() {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
try {
|
||||
final SSLContext ctx = SSLContext.getInstance("TLS");
|
||||
ctx.init(null, new TrustManager[] { tm }, null);
|
||||
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
|
||||
} catch (GeneralSecurityException e) {
|
||||
log.fatal(e);
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public int getMaxNumberOfRetry() {
|
||||
return maxNumberOfRetry;
|
||||
}
|
||||
|
||||
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
|
||||
this.maxNumberOfRetry = maxNumberOfRetry;
|
||||
}
|
||||
|
||||
public int getDefaultDelay() {
|
||||
return defaultDelay;
|
||||
}
|
||||
|
||||
public void setDefaultDelay(final int defaultDelay) {
|
||||
this.defaultDelay = defaultDelay;
|
||||
}
|
||||
|
||||
public int getReadTimeOut() {
|
||||
return readTimeOut;
|
||||
}
|
||||
|
||||
public void setReadTimeOut(final int readTimeOut) {
|
||||
this.readTimeOut = readTimeOut;
|
||||
}
|
||||
|
||||
public String getResponseType() {
|
||||
return responseType;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
package eu.dnetlib.data.collector.plugins.archive.targz;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
/**
|
||||
* Collector pluging for collecting a .tar.gz folder of records
|
||||
*
|
||||
* @author andrea
|
||||
*
|
||||
*/
|
||||
public class TarGzCollectorPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
|
||||
final String baseUrl = interfaceDescriptor.getBaseUrl();
|
||||
if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
|
||||
return new TarGzIterable(interfaceDescriptor);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
package eu.dnetlib.data.collector.plugins.archive.targz;
|
||||
|
||||
import java.io.File;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Iterator;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.collect.Iterators;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
/**
|
||||
* The Class TarGzIterable.
|
||||
*
|
||||
* @author Andrea
|
||||
*/
|
||||
public class TarGzIterable implements Iterable<String> {
|
||||
|
||||
/** The path to tar.gz archive. */
|
||||
private File tarGzFile;
|
||||
|
||||
public TarGzIterable(final InterfaceDescriptor interfaceDescriptor) throws CollectorServiceException {
|
||||
try {
|
||||
final String tarGzPath = interfaceDescriptor.getBaseUrl();
|
||||
URL tarGzUrl = new URL(tarGzPath);
|
||||
this.tarGzFile = new File(tarGzUrl.getPath());
|
||||
if (!tarGzFile.exists()) { throw new CollectorServiceException(String.format("The base ULR %s, does not exist", tarGzFile.getPath())); }
|
||||
} catch (MalformedURLException e) {
|
||||
throw new CollectorServiceException("TarGz collector failed! ", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
final TarGzIterator tgzIterator = new TarGzIterator(tarGzFile.getAbsolutePath());
|
||||
return Iterators.transform(tgzIterator, new Function<String, String>() {
|
||||
|
||||
@Override
|
||||
public String apply(final String inputRecord) {
|
||||
return XmlCleaner.cleanAllEntities(inputRecord.startsWith("\uFEFF") ? inputRecord.substring(1) : inputRecord);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,86 @@
|
|||
package eu.dnetlib.data.collector.plugins.archive.targz;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
public class TarGzIterator implements Iterator<String> {
|
||||
|
||||
/** The Constant log. */
|
||||
private static final Log log = LogFactory.getLog(TarGzIterator.class);
|
||||
|
||||
private TarArchiveInputStream tarInputStream;
|
||||
private String current;
|
||||
|
||||
public TarGzIterator(final String tarGzPath) {
|
||||
try {
|
||||
this.tarInputStream = new TarArchiveInputStream(new BufferedInputStream(new GZIPInputStream(new FileInputStream(tarGzPath))));
|
||||
this.current = findNext();
|
||||
} catch (FileNotFoundException e) {
|
||||
log.error("Tar.gz file not found: " + tarGzPath, e);
|
||||
} catch (IOException e) {
|
||||
log.error("Problem opening tar.gz file " + tarGzPath, e);
|
||||
}
|
||||
}
|
||||
|
||||
public TarGzIterator(final File tarGzFile) {
|
||||
try {
|
||||
this.tarInputStream = new TarArchiveInputStream(new BufferedInputStream(new GZIPInputStream(new FileInputStream(tarGzFile))));
|
||||
this.current = findNext();
|
||||
} catch (FileNotFoundException e) {
|
||||
log.error("Tar.gz file not found: " + tarGzFile.getAbsolutePath(), e);
|
||||
} catch (IOException e) {
|
||||
log.error("Problem opening tar.gz file " + tarGzFile.getAbsolutePath(), e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return current != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
String ret = new String(current);
|
||||
current = findNext();
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {}
|
||||
|
||||
private synchronized String findNext() {
|
||||
TarArchiveEntry entry = null;
|
||||
try {
|
||||
while (null != (entry = tarInputStream.getNextTarEntry()) && !entry.isFile()) {
|
||||
log.debug("Skipping TAR entry " + entry.getName());
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("Error during tar.gz extraction", e);
|
||||
}
|
||||
|
||||
if (entry == null) {
|
||||
return null;
|
||||
} else {
|
||||
log.debug("Extracting " + entry.getName());
|
||||
byte[] content = new byte[(int) entry.getSize()];
|
||||
try {
|
||||
tarInputStream.read(content, 0, content.length);
|
||||
return new String(content);
|
||||
} catch (IOException e) {
|
||||
log.error("Impossible to extract file " + entry.getName(), e);
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
package eu.dnetlib.data.collector.plugins.archive.zip;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
/**
|
||||
* Collector pluging for collecting a zipped folder of records
|
||||
*
|
||||
* @author Andrea
|
||||
*
|
||||
*/
|
||||
public class ZipCollectorPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
|
||||
final String baseUrl = interfaceDescriptor.getBaseUrl();
|
||||
if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
|
||||
return new ZipIterable(interfaceDescriptor);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
package eu.dnetlib.data.collector.plugins.archive.zip;
|
||||
|
||||
import java.io.File;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Iterator;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.collect.Iterators;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Andrea
|
||||
*
|
||||
*/
|
||||
public class ZipIterable implements Iterable<String> {
|
||||
|
||||
/** The path to .zip archive. */
|
||||
private File zipFile;
|
||||
|
||||
public ZipIterable(final InterfaceDescriptor interfaceDescriptor) throws CollectorServiceException {
|
||||
try {
|
||||
final String zipPath = interfaceDescriptor.getBaseUrl();
|
||||
URL zipUrl = new URL(zipPath);
|
||||
this.zipFile = new File(zipUrl.getPath());
|
||||
if (!zipFile.exists()) { throw new CollectorServiceException(String.format("The base ULR %s, does not exist", zipFile.getPath())); }
|
||||
} catch (MalformedURLException e) {
|
||||
throw new CollectorServiceException("Zip collector failed! ", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
final ZipIterator zipIterator = new ZipIterator(zipFile.getAbsolutePath());
|
||||
return Iterators.transform(zipIterator, new Function<String, String>() {
|
||||
|
||||
@Override
|
||||
public String apply(final String inputRecord) {
|
||||
return XmlCleaner.cleanAllEntities(inputRecord.startsWith("\uFEFF") ? inputRecord.substring(1) : inputRecord);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,80 @@
|
|||
package eu.dnetlib.data.collector.plugins.archive.zip;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Enumeration;
|
||||
import java.util.Iterator;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipFile;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
public class ZipIterator implements Iterator<String> {
|
||||
|
||||
/** The Constant log. */
|
||||
private static final Log log = LogFactory.getLog(ZipIterator.class);
|
||||
|
||||
ZipFile zipFile;
|
||||
Enumeration<? extends ZipEntry> entries;
|
||||
private String current;
|
||||
|
||||
public ZipIterator(final String zipPath) {
|
||||
try {
|
||||
this.zipFile = new ZipFile(zipPath);
|
||||
this.entries = zipFile.entries();
|
||||
this.current = findNext();
|
||||
} catch (IOException e) {
|
||||
log.error("Problems opening the .zip file " + zipPath, e);
|
||||
}
|
||||
}
|
||||
|
||||
public ZipIterator(final File file) {
|
||||
try {
|
||||
this.zipFile = new ZipFile(file);
|
||||
this.entries = zipFile.entries();
|
||||
this.current = findNext();
|
||||
} catch (IOException e) {
|
||||
log.error("Problems opening the .zip file " + zipFile.getName(), e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return current != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
String ret = new String(current);
|
||||
current = findNext();
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {}
|
||||
|
||||
private synchronized String findNext() {
|
||||
ZipEntry entry = null;
|
||||
while (entries.hasMoreElements() && (entry = entries.nextElement()).isDirectory()) {
|
||||
log.debug("Skipping Zip entry " + entry.getName());
|
||||
}
|
||||
|
||||
if (entry == null) {
|
||||
return null;
|
||||
} else {
|
||||
log.debug("Extracting " + entry.getName());
|
||||
try {
|
||||
InputStream stream = zipFile.getInputStream(entry);
|
||||
return IOUtils.toString(stream);
|
||||
} catch (IOException e) {
|
||||
log.error("Problems extracting entry " + entry.getName(), e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
package eu.dnetlib.data.collector.plugins.datacite;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.Date;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
public class DataciteCollectorPlugin extends AbstractCollectorPlugin implements CollectorPlugin {
|
||||
|
||||
private static final Log log = LogFactory.getLog(DataciteCollectorPlugin.class);
|
||||
|
||||
private DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd");
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String fromDate, String untilDate) throws CollectorServiceException {
|
||||
|
||||
String baseurl = interfaceDescriptor.getBaseUrl();
|
||||
if (StringUtils.isBlank(baseurl)) throw new CollectorServiceException("baseUrl cannot be empty");
|
||||
long timestamp = 0;
|
||||
if (StringUtils.isNotBlank(fromDate)) {
|
||||
try {
|
||||
Date date = org.apache.commons.lang.time.DateUtils.parseDate(
|
||||
fromDate,
|
||||
new String[] { "yyyy-MM-dd", "yyyy-MM-dd'T'HH:mm:ssXXX", "yyyy-MM-dd'T'HH:mm:ss.SSSX", "yyyy-MM-dd'T'HH:mm:ssZ",
|
||||
"yyyy-MM-dd'T'HH:mm:ss.SX" });
|
||||
//timestamp =parsed.getTime() /1000;
|
||||
timestamp = date.toInstant().toEpochMilli() / 1000;
|
||||
log.info("Querying for Datacite records from timestamp " + timestamp + " (date was " + fromDate + ")");
|
||||
|
||||
} catch (ParseException e) {
|
||||
throw new CollectorServiceException(e);
|
||||
}
|
||||
}
|
||||
final long finalTimestamp = timestamp;
|
||||
return () -> {
|
||||
try {
|
||||
return new DataciteESIterator(finalTimestamp, baseurl);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,125 @@
|
|||
package eu.dnetlib.data.collector.plugins.datacite;
|
||||
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Iterator;
|
||||
import java.util.Objects;
|
||||
import java.util.Queue;
|
||||
import java.util.zip.DataFormatException;
|
||||
import java.util.zip.Inflater;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import eu.dnetlib.data.collector.plugins.datacite.schema.DataciteSchema;
|
||||
import eu.dnetlib.data.collector.plugins.datacite.schema.Result;
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
/**
 * Iterator over Datacite records served by a scroll-style scan endpoint:
 * {@code <baseURL>/new_scan[?timestamp=...]} starts a scan, then
 * {@code <baseURL>/scan/<scrollId>} fetches the following pages. Each result
 * body is Base64-encoded, DEFLATE-compressed record content.
 */
public class DataciteESIterator implements Iterator<String> {


	// Lower bound (epoch seconds) passed to the initial scan request; 0 means full scan.
	private final long timestamp;

	// Scroll cursor returned by the service, updated on every page.
	private String scrollId;

	// Records of the current page, consumed from the head.
	private Queue<String> currentPage;

	private final Gson g = new GsonBuilder().create();

	// Default endpoint; overwritten by the constructor argument.
	private String baseURL = "http://ip-90-147-167-25.ct1.garrservices.it:5000";

	private static final String START_PATH = "new_scan";
	private static final String NEXT_PATH = "scan/%s";


	/**
	 * Starts a new scan immediately (performs the first HTTP request).
	 */
	public DataciteESIterator(long timestamp, String baseUrl) throws Exception {
		this.timestamp = timestamp;
		this.baseURL = baseUrl;
		currentPage = new ArrayDeque<>();
		startRequest();
	}

	/**
	 * Decodes a result body (Base64 + DEFLATE) into the record string, or
	 * returns null when the payload is not valid compressed data.
	 */
	private static String decompression(final Result r) {
		try {
			byte[] byteArray = Base64.decodeBase64(r.getBody().getBytes());
			Inflater decompresser = new Inflater();
			decompresser.setInput(byteArray);
			ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
			byte[] buffer = new byte[8192];
			while (!decompresser.finished()) {
				int size = decompresser.inflate(buffer);
				bos.write(buffer, 0, size);
			}
			byte[] unzippeddata = bos.toByteArray();
			decompresser.end();

			// NOTE(review): platform default charset — presumably the service emits UTF-8; verify.
			return new String(unzippeddata);
		} catch (DataFormatException e) {
			return null;
		}

	}

	/**
	 * Parses a JSON page, updates the scroll id and enqueues the decoded
	 * records. A blank or "[]" response (end of scan) is ignored, leaving the
	 * queue empty so iteration stops.
	 */
	private void fillQueue(final String hits) {
		if (StringUtils.isBlank(hits) || "[]".equalsIgnoreCase(hits.trim()))
			return;
		try {
			DataciteSchema datacitepage = g.fromJson(hits, DataciteSchema.class);
			this.scrollId = datacitepage.getScrollId();
			datacitepage.getResult().stream().map(DataciteESIterator::decompression).filter(Objects::nonNull).forEach(this.currentPage::add);
		} catch (Throwable e) {
			// NOTE(review): parse failures are only printed and otherwise swallowed,
			// silently ending the iteration — TODO confirm this is intended.
			System.out.println(hits);
			e.printStackTrace();
		}
	}

	// Issues the initial new_scan request, optionally bounded by the timestamp.
	private void startRequest() throws Exception {
		String url = baseURL+"/"+START_PATH;
		final URL startUrl = new URL(timestamp >0 ? url + "?timestamp="+timestamp : url);
		fillQueue(IOUtils.toString(startUrl.openStream()));
	}

	// Fetches the next page using the current scroll id.
	private void getNextPage() throws IOException {
		String url = baseURL+"/"+NEXT_PATH;
		final URL startUrl = new URL(String.format(url,scrollId));
		fillQueue(IOUtils.toString(startUrl.openStream()));
	}


	@Override
	public boolean hasNext() {
		return currentPage.size() >0;
	}

	/**
	 * Returns the next record; after handing out the last record of a page it
	 * eagerly prefetches the following page so hasNext() stays accurate.
	 *
	 * @return the next record, or null when the queue is already empty
	 */
	@Override
	public String next() {

		if (currentPage.size() == 0) {

			return null;
		}

		String nextItem = currentPage.remove();
		if (currentPage.size() == 0) {
			try {
				getNextPage();
			} catch (Throwable e) {
				throw new RuntimeException(e);
			}
		}

		return nextItem;
	}

	public String getBaseURL() {
		return baseURL;
	}

	public void setBaseURL(final String baseURL) {
		this.baseURL = baseURL;
	}
}
|
|
@ -0,0 +1,55 @@
|
|||
|
||||
package eu.dnetlib.data.collector.plugins.datacite.schema;
|
||||
|
||||
import java.util.List;
|
||||
import com.google.gson.annotations.Expose;
|
||||
import com.google.gson.annotations.SerializedName;
|
||||
|
||||
/**
 * Gson-mapped page of the Datacite scan service response: a scroll cursor
 * plus the list of {@link Result} entries for this page.
 */
public class DataciteSchema {

	// NOTE(review): presumably the number of results in this page — confirm against the service.
	@SerializedName("counter")
	@Expose
	private Integer counter;
	@SerializedName("result")
	@Expose
	private List<Result> result = null;
	// Opaque cursor used to request the following page (see DataciteESIterator.getNextPage).
	@SerializedName("scroll_id")
	@Expose
	private String scrollId;
	// NOTE(review): presumably the total number of results in the scan — confirm against the service.
	@SerializedName("total")
	@Expose
	private Integer total;

	public Integer getCounter() {
		return counter;
	}

	public void setCounter(Integer counter) {
		this.counter = counter;
	}

	public List<Result> getResult() {
		return result;
	}

	public void setResult(List<Result> result) {
		this.result = result;
	}

	public String getScrollId() {
		return scrollId;
	}

	public void setScrollId(String scrollId) {
		this.scrollId = scrollId;
	}

	public Integer getTotal() {
		return total;
	}

	public void setTotal(Integer total) {
		this.total = total;
	}

}
|
|
@ -0,0 +1,54 @@
|
|||
|
||||
package eu.dnetlib.data.collector.plugins.datacite.schema;
|
||||
|
||||
import com.google.gson.annotations.Expose;
|
||||
import com.google.gson.annotations.SerializedName;
|
||||
|
||||
public class Result {
|
||||
|
||||
@SerializedName("body")
|
||||
@Expose
|
||||
private String body;
|
||||
@SerializedName("id")
|
||||
@Expose
|
||||
private String id;
|
||||
@SerializedName("originalId")
|
||||
@Expose
|
||||
private String originalId;
|
||||
@SerializedName("timestamp")
|
||||
@Expose
|
||||
private Integer timestamp;
|
||||
|
||||
public String getBody() {
|
||||
return body;
|
||||
}
|
||||
|
||||
public void setBody(String body) {
|
||||
this.body = body;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getOriginalId() {
|
||||
return originalId;
|
||||
}
|
||||
|
||||
public void setOriginalId(String originalId) {
|
||||
this.originalId = originalId;
|
||||
}
|
||||
|
||||
public Integer getTimestamp() {
|
||||
return timestamp;
|
||||
}
|
||||
|
||||
public void setTimestamp(Integer timestamp) {
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,115 @@
|
|||
package eu.dnetlib.data.collector.plugins.datasets;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* The Class DatasetsByProjectIterator.
|
||||
*/
|
||||
public class DatasetsByJournalIterator implements Iterable<String>, Iterator<String> {
|
||||
|
||||
/** The current iterator. */
|
||||
private Iterator<String> currentIterator;
|
||||
|
||||
/** The current project. */
|
||||
private PangaeaJournalInfo currentJournal;
|
||||
|
||||
private Iterator<PangaeaJournalInfo> inputIterator;
|
||||
|
||||
/** The logger. */
|
||||
private static final Log log = LogFactory.getLog(DatasetsByProjectIterator.class);
|
||||
|
||||
public DatasetsByJournalIterator(final Iterator<PangaeaJournalInfo> iterator) {
|
||||
this.inputIterator = iterator;
|
||||
this.currentJournal = extractNextLine();
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.Iterator#hasNext()
|
||||
*/
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
// CASE WHEN WE REACH THE LAST ITEM ON CSV
|
||||
// OR WE HAD SOME PROBLEM ON GET NEXT CSV ITEM
|
||||
if (this.currentJournal == null) { return false; }
|
||||
// IN THIS CASE WE HAVE ANOTHER DATASETS
|
||||
// FOR THE CURRENT PROJECT AND RETURN TRUE
|
||||
if (currentIterator != null && currentIterator.hasNext()) { return true; }
|
||||
// OTHERWISE WE FINISHED TO ITERATE THE CURRENT
|
||||
// SETS OF DATASETS FOR A PARTICULAR PROJECT
|
||||
// SO WE HAVE TO RETRIEVE THE NEXT ITERATOR WITH
|
||||
// ITEMS
|
||||
this.currentJournal = extractNextLine();
|
||||
|
||||
while (this.currentJournal != null) {
|
||||
currentIterator = getNextIterator();
|
||||
// IF THE NEXT ITERATOR HAS ITEMS RETURN YES
|
||||
// OTHERWISE THE CICLE CONTINUE
|
||||
if (currentIterator.hasNext()) { return true; }
|
||||
this.currentJournal = extractNextLine();
|
||||
}
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.Iterator#next()
|
||||
*/
|
||||
@Override
|
||||
public String next() {
|
||||
return this.currentIterator.next();
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.Iterator#remove()
|
||||
*/
|
||||
@Override
|
||||
public void remove() {}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.lang.Iterable#iterator()
|
||||
*/
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
if (this.currentJournal != null) {
|
||||
currentIterator = getNextIterator();
|
||||
return this;
|
||||
}
|
||||
return null;
|
||||
|
||||
}
|
||||
|
||||
private Iterator<String> getNextIterator() {
|
||||
QueryField q = new QueryField();
|
||||
RequestField r = new RequestField();
|
||||
r.setQuery(q);
|
||||
q.getTerm().put("ft-techkeyword", this.currentJournal.getJournalId());
|
||||
|
||||
return new DatasetsIterator(r, "", this.currentJournal).iterator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract next line.
|
||||
*
|
||||
* @return the map
|
||||
* @throws IOException
|
||||
* Signals that an I/O exception has occurred.
|
||||
*/
|
||||
private PangaeaJournalInfo extractNextLine() {
|
||||
|
||||
if (this.inputIterator.hasNext() == false) { return null; }
|
||||
return this.inputIterator.next();
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,158 @@
|
|||
package eu.dnetlib.data.collector.plugins.datasets;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
/**
|
||||
* The Class DatasetsByProjectIterator.
|
||||
*/
|
||||
public class DatasetsByProjectIterator implements Iterable<String>, Iterator<String> {
|
||||
|
||||
private static final String SPLIT_REGEX = ";";
|
||||
|
||||
/** The project id key. */
|
||||
public static String PROJECT_ID_KEY = "id";
|
||||
|
||||
/** The project name key. */
|
||||
public static String PROJECT_NAME_KEY = "name";
|
||||
|
||||
/** The project corda id key. */
|
||||
public static String PROJECT_CORDA_ID_KEY = "corda_id";
|
||||
|
||||
/** The current iterator. */
|
||||
private Iterator<String> currentIterator;
|
||||
|
||||
/** The csv reader. */
|
||||
private BufferedReader csvReader;
|
||||
|
||||
/** The current project. */
|
||||
private Map<String, String> currentProject;
|
||||
|
||||
/** The logger. */
|
||||
private static final Log log = LogFactory.getLog(DatasetsByProjectIterator.class);
|
||||
|
||||
/**
|
||||
* Instantiates a new datasets by project iterator.
|
||||
*
|
||||
* @param csvInputStream
|
||||
* the csv input stream
|
||||
* @throws IOException
|
||||
* Signals that an I/O exception has occurred.
|
||||
*/
|
||||
public DatasetsByProjectIterator(final InputStreamReader csvInputStream) throws IOException {
|
||||
this.csvReader = new BufferedReader(csvInputStream);
|
||||
this.currentProject = extractNextLine();
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.Iterator#hasNext()
|
||||
*/
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
// CASE WHEN WE REACH THE LAST ITEM ON CSV
|
||||
// OR WE HAD SOME PROBLEM ON GET NEXT CSV ITEM
|
||||
if (this.currentProject == null) { return false; }
|
||||
// IN THIS CASE WE HAVE ANOTHER DATASETS
|
||||
// FOR THE CURRENT PROJECT AND RETURN TRUE
|
||||
if (currentIterator != null && currentIterator.hasNext()) { return true; }
|
||||
// OTHERWISE WE FINISHED TO ITERATE THE CURRENT
|
||||
// SETS OF DATASETS FOR A PARTICULAR PROJECT
|
||||
// SO WE HAVE TO RETRIEVE THE NEXT ITERATOR WITH
|
||||
// ITEMS
|
||||
this.currentProject = extractNextLine();
|
||||
|
||||
while (this.currentProject != null) {
|
||||
currentIterator = getNextIterator();
|
||||
// IF THE NEXT ITERATOR HAS ITEMS RETURN YES
|
||||
// OTHERWISE THE CICLE CONTINUE
|
||||
if (currentIterator.hasNext()) { return true; }
|
||||
this.currentProject = extractNextLine();
|
||||
}
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.Iterator#next()
|
||||
*/
|
||||
@Override
|
||||
public String next() {
|
||||
return this.currentIterator.next();
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.Iterator#remove()
|
||||
*/
|
||||
@Override
|
||||
public void remove() {}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.lang.Iterable#iterator()
|
||||
*/
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
if (this.currentProject != null) {
|
||||
currentIterator = getNextIterator();
|
||||
return this;
|
||||
}
|
||||
return null;
|
||||
|
||||
}
|
||||
|
||||
private Iterator<String> getNextIterator() {
|
||||
QueryField q = new QueryField();
|
||||
RequestField r = new RequestField();
|
||||
r.setQuery(q);
|
||||
q.getTerm().put("ft-techkeyword", this.currentProject.get(PROJECT_ID_KEY));
|
||||
return new DatasetsIterator(r, this.currentProject.get(PROJECT_CORDA_ID_KEY), null).iterator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract next line.
|
||||
*
|
||||
* @return the map
|
||||
* @throws IOException
|
||||
* Signals that an I/O exception has occurred.
|
||||
*/
|
||||
private Map<String, String> extractNextLine() {
|
||||
String line;
|
||||
try {
|
||||
line = this.csvReader.readLine();
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
// WE REACH THE END OF THE CSV
|
||||
if (line == null) { return null; }
|
||||
log.debug("splitting line: " + line);
|
||||
String[] values = line.split(SPLIT_REGEX);
|
||||
if (values == null || values.length != 4) {
|
||||
log.error("Error on splitting line, the length must be 4");
|
||||
return null;
|
||||
}
|
||||
int id = Integer.parseInt(values[0]);
|
||||
String project_name = values[2];
|
||||
String cordaId = values[3];
|
||||
Map<String, String> splittedMap = Maps.newHashMap();
|
||||
splittedMap.put(PROJECT_CORDA_ID_KEY, cordaId);
|
||||
splittedMap.put(PROJECT_ID_KEY, "project" + id);
|
||||
splittedMap.put(PROJECT_NAME_KEY, project_name);
|
||||
log.debug(String.format("found project %s with id Corda: %s and id for API: %s", project_name, cordaId, "project" + id));
|
||||
return splittedMap;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
package eu.dnetlib.data.collector.plugins.datasets;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URL;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
public class DatasetsByProjectPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
try {
|
||||
URL url = new URL(interfaceDescriptor.getBaseUrl());
|
||||
url.openConnection();
|
||||
InputStreamReader reader = new InputStreamReader(url.openStream());
|
||||
DatasetsByProjectIterator iterator = new DatasetsByProjectIterator(reader);
|
||||
return iterator;
|
||||
} catch (IOException e) {
|
||||
throw new CollectorServiceException("OOOPS something bad happen on creating iterator ", e);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,274 @@
|
|||
package eu.dnetlib.data.collector.plugins.datasets;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringEscapeUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpPost;
|
||||
import org.apache.http.entity.StringEntity;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
|
||||
/**
|
||||
* The Class JournalIterator.
|
||||
*/
|
||||
public class DatasetsIterator implements Iterable<String>, Iterator<String> {
|
||||
|
||||
/** The logger. */
|
||||
private static final Log log = LogFactory.getLog(DatasetsIterator.class);
|
||||
|
||||
/** The base url template. */
|
||||
private static String BASE_URL_TEMPLATE = "http://ws.pangaea.de/es/pangaea/panmd/_search?_source=xml&size=%d&from=%d";
|
||||
|
||||
/** The journal id. */
|
||||
private String journalId = "";
|
||||
|
||||
/** The journal name. */
|
||||
private String journalName = "";
|
||||
|
||||
/** The journal issn. */
|
||||
private String journalISSN = "";
|
||||
|
||||
/** The openaire datasource. */
|
||||
private String openaireDatasource = "";
|
||||
|
||||
/** The total. */
|
||||
private long total;
|
||||
|
||||
/** The from. */
|
||||
private int from;
|
||||
|
||||
/** The current iterator. */
|
||||
private int currentIterator;
|
||||
|
||||
/** The current response. */
|
||||
private ElasticSearchResponse currentResponse;
|
||||
|
||||
/** The request. */
|
||||
private RequestField request;
|
||||
|
||||
/** The default size. */
|
||||
private static int DEFAULT_SIZE = 10;
|
||||
|
||||
private String projectCordaId;
|
||||
|
||||
private static String RECORD_TEMPLATE = "<datasetsRecord><oaf:projectid xmlns:oaf=\"http://namespace.openaire.eu/oaf\">%s</oaf:projectid>"
|
||||
+ "<journal name='%s' issn='%s' datasourceid = '%s'/><metadata>%s</metadata></datasetsRecord>";
|
||||
|
||||
/**
|
||||
* Instantiates a new journal iterator.
|
||||
*
|
||||
* @param request
|
||||
* the request
|
||||
*/
|
||||
public DatasetsIterator(final RequestField request, final String projectCordaId, final PangaeaJournalInfo info) {
|
||||
this.request = request;
|
||||
this.setProjectCordaId(projectCordaId);
|
||||
|
||||
if (info != null) {
|
||||
this.setJournalId(info.getJournalId());
|
||||
this.setJournalName(StringEscapeUtils.escapeXml(info.getJournalName()));
|
||||
this.setJournalISSN(info.getJournalISSN());
|
||||
this.setOpenaireDatasource(info.getDatasourceId());
|
||||
}
|
||||
log.debug("Start Iterator");
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute query.
|
||||
*
|
||||
* @param from
|
||||
* the from
|
||||
* @param size
|
||||
* the size
|
||||
* @return the string
|
||||
*/
|
||||
private String executeQuery(final int from, final int size) {
|
||||
log.debug("executing query " + this.request.getQuery().getTerm());
|
||||
log.debug(String.format("from:%d size:%d", from, size));
|
||||
CloseableHttpResponse response = null;
|
||||
InputStream responseBody = null;
|
||||
CloseableHttpClient httpclient = HttpClients.createDefault();
|
||||
try {
|
||||
|
||||
HttpPost post = new HttpPost(String.format(BASE_URL_TEMPLATE, size, from));
|
||||
Gson g = new GsonBuilder().disableHtmlEscaping().create();
|
||||
StringEntity entry = new StringEntity(g.toJson(this.request));
|
||||
post.setEntity(entry);
|
||||
long start = System.currentTimeMillis();
|
||||
response = httpclient.execute(post);
|
||||
int statusCode = response.getStatusLine().getStatusCode();
|
||||
if (statusCode == 200) {
|
||||
responseBody = response.getEntity().getContent();
|
||||
String s = IOUtils.toString(responseBody);
|
||||
log.debug("Request done in " + (System.currentTimeMillis() - start) + " ms");
|
||||
responseBody.close();
|
||||
return s;
|
||||
}
|
||||
return null;
|
||||
} catch (Exception e) {
|
||||
log.error("Error on executing query :" + request.getQuery().getTerm(), e);
|
||||
return null;
|
||||
} finally {
|
||||
try {
|
||||
responseBody.close();
|
||||
response.close();
|
||||
httpclient.close();
|
||||
} catch (IOException e) {
|
||||
log.error("Can't close connections gracefully", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the journal id.
|
||||
*
|
||||
* @return the journalId
|
||||
*/
|
||||
public String getJournalId() {
|
||||
return journalId;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the journal id.
|
||||
*
|
||||
* @param journalId
|
||||
* the journalId to set
|
||||
*/
|
||||
public void setJournalId(final String journalId) {
|
||||
this.journalId = journalId;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.Iterator#hasNext()
|
||||
*/
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return (from + currentIterator) < total;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.Iterator#next()
|
||||
*/
|
||||
@Override
|
||||
public String next() {
|
||||
String xml = String.format(RECORD_TEMPLATE, this.projectCordaId, this.journalName, this.journalISSN, this.openaireDatasource, currentResponse
|
||||
.getXmlRecords().get(currentIterator));
|
||||
currentIterator++;
|
||||
if (currentIterator == DEFAULT_SIZE) {
|
||||
getNextItem();
|
||||
}
|
||||
return xml;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.Iterator#remove()
|
||||
*/
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.lang.Iterable#iterator()
|
||||
*/
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
from = 0;
|
||||
total = 0;
|
||||
getNextItem();
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next item.
|
||||
*
|
||||
* @return the next item
|
||||
*/
|
||||
private void getNextItem() {
|
||||
from += currentIterator;
|
||||
currentResponse = ElasticSearchResponse.createNewResponse(executeQuery(from, DEFAULT_SIZE));
|
||||
total = currentResponse == null ? 0 : currentResponse.getTotal();
|
||||
log.debug("from : " + from + " total of the request is " + total);
|
||||
currentIterator = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the projectCordaId
|
||||
*/
|
||||
public String getProjectCordaId() {
|
||||
return projectCordaId;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param projectCordaId
|
||||
* the projectCordaId to set
|
||||
*/
|
||||
public void setProjectCordaId(final String projectCordaId) {
|
||||
this.projectCordaId = projectCordaId;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the journalName
|
||||
*/
|
||||
public String getJournalName() {
|
||||
return journalName;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param journalName
|
||||
* the journalName to set
|
||||
*/
|
||||
public void setJournalName(final String journalName) {
|
||||
this.journalName = journalName;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the journalISSN
|
||||
*/
|
||||
public String getJournalISSN() {
|
||||
return journalISSN;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param journalISSN
|
||||
* the journalISSN to set
|
||||
*/
|
||||
public void setJournalISSN(final String journalISSN) {
|
||||
this.journalISSN = journalISSN;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the openaireDatasource
|
||||
*/
|
||||
public String getOpenaireDatasource() {
|
||||
return openaireDatasource;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param openaireDatasource
|
||||
* the openaireDatasource to set
|
||||
*/
|
||||
public void setOpenaireDatasource(final String openaireDatasource) {
|
||||
this.openaireDatasource = openaireDatasource;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
package eu.dnetlib.data.collector.plugins.datasets;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import com.google.gson.JsonArray;
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonObject;
|
||||
import com.google.gson.JsonParser;
|
||||
|
||||
public class ElasticSearchResponse {
|
||||
|
||||
/** The logger. */
|
||||
private static final Log log = LogFactory.getLog(ElasticSearchResponse.class);
|
||||
private long total;
|
||||
private List<String> xmlRecords;
|
||||
|
||||
public static ElasticSearchResponse createNewResponse(final String response) {
|
||||
ElasticSearchResponse item = new ElasticSearchResponse();
|
||||
|
||||
if (response == null) {
|
||||
log.fatal("Error: null elasticsearch reponse");
|
||||
return null;
|
||||
|
||||
}
|
||||
JsonElement jElement = new JsonParser().parse(response);
|
||||
JsonObject jobject = jElement.getAsJsonObject();
|
||||
if (jobject.has("hits")) {
|
||||
|
||||
item.setTotal(jobject.get("hits").getAsJsonObject().get("total").getAsLong());
|
||||
|
||||
JsonElement hits = ((JsonObject) jobject.get("hits")).get("hits");
|
||||
|
||||
JsonArray hitsObject = hits.getAsJsonArray();
|
||||
|
||||
List<String> records = new ArrayList<String>();
|
||||
|
||||
for (JsonElement elem : hitsObject) {
|
||||
JsonObject _source = (JsonObject) ((JsonObject) elem).get("_source");
|
||||
String xml = _source.get("xml").getAsString();
|
||||
records.add(xml);
|
||||
}
|
||||
item.setXmlRecords(records);
|
||||
return item;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the xmlRecords
|
||||
*/
|
||||
public List<String> getXmlRecords() {
|
||||
return xmlRecords;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param xmlRecords
|
||||
* the xmlRecords to set
|
||||
*/
|
||||
public void setXmlRecords(final List<String> xmlRecords) {
|
||||
this.xmlRecords = xmlRecords;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the total
|
||||
*/
|
||||
public long getTotal() {
|
||||
return total;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param total
|
||||
* the total to set
|
||||
*/
|
||||
public void setTotal(final long total) {
|
||||
this.total = total;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,92 @@
|
|||
package eu.dnetlib.data.collector.plugins.datasets;
|
||||
|
||||
/**
 * The Class PangaeaJournalInfo: descriptive metadata about a Pangaea journal.
 */
|
||||
public class PangaeaJournalInfo {

	/** The journal name. */
	private String journalName;

	/** The journal id (used as the "ft-techkeyword" query term). */
	private String journalId;

	/** The OpenAIRE datasource id associated with this journal. */
	private String datasourceId;

	/** The journal issn. */
	private String journalISSN;

	/**
	 * Gets the journal name.
	 *
	 * @return the journal name
	 */
	public String getJournalName() {
		return journalName;
	}

	/**
	 * Sets the journal name.
	 *
	 * @param journalName
	 *            the new journal name
	 */
	public void setJournalName(final String journalName) {
		this.journalName = journalName;
	}

	/**
	 * Gets the journal id.
	 *
	 * @return the journal id
	 */
	public String getJournalId() {
		return journalId;
	}

	/**
	 * Sets the journal id.
	 *
	 * @param journalId
	 *            the new journal id
	 */
	public void setJournalId(final String journalId) {
		this.journalId = journalId;
	}

	/**
	 * Gets the datasource id.
	 *
	 * @return the datasource id
	 */
	public String getDatasourceId() {
		return datasourceId;
	}

	/**
	 * Sets the datasource id.
	 *
	 * @param datasourceId
	 *            the new datasource id
	 */
	public void setDatasourceId(final String datasourceId) {
		this.datasourceId = datasourceId;
	}

	/**
	 * Gets the journal ISSN.
	 *
	 * @return the journalISSN
	 */
	public String getJournalISSN() {
		return journalISSN;
	}

	/**
	 * Sets the journal ISSN.
	 *
	 * @param journalISSN
	 *            the journalISSN to set
	 */
	public void setJournalISSN(final String journalISSN) {
		this.journalISSN = journalISSN;
	}

}
|
|
@ -0,0 +1,29 @@
|
|||
package eu.dnetlib.data.collector.plugins.datasets;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * The "query" clause of the elasticsearch request payload: a map of term
 * filters, keyed by field name.
 */
public class QueryField {

	private Map<String, String> term;

	/**
	 * Creates an empty query.
	 */
	public QueryField() {
		// Fix: assign the field directly instead of calling the overridable
		// setTerm(...) from the constructor (unsafe if the class is subclassed).
		this.term = new HashMap<String, String>();
	}

	/**
	 * @return the term map (a live reference: additions are reflected in the query)
	 */
	public Map<String, String> getTerm() {
		return term;
	}

	/**
	 * @param term
	 *            the term to set
	 */
	public void setTerm(final Map<String, String> term) {
		this.term = term;
	}

}
|
|
@ -0,0 +1,21 @@
|
|||
package eu.dnetlib.data.collector.plugins.datasets;
|
||||
|
||||
public class RequestField {
|
||||
|
||||
private QueryField query;
|
||||
|
||||
/**
|
||||
* @return the query
|
||||
*/
|
||||
public QueryField getQuery() {
|
||||
return query;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param query the query to set
|
||||
*/
|
||||
public void setQuery(QueryField query) {
|
||||
this.query = query;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
package eu.dnetlib.data.collector.plugins.datasources;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.plugins.HttpConnector;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
|
||||
/**
|
||||
* Plugin to collect metadata record about data repositories from re3data.
|
||||
* <p>
|
||||
* Documentation on re3data API: http://service.re3data.org/api/doc.
|
||||
* </p>
|
||||
* <p>
|
||||
* BaseURL: http://service.re3data.org
|
||||
* </p>
|
||||
* <p>
|
||||
* API to get the list of repos: baseURL + /api/v1/repositories
|
||||
* </p>
|
||||
* <p>
|
||||
* API to get a repository: baseURL + content of link/@href of the above list
|
||||
* </p>
|
||||
*
|
||||
* @author alessia
|
||||
*
|
||||
*/
|
||||
public class Re3DataCollectorPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
private String repositoryListPath = "/api/v1/repositories";
|
||||
|
||||
@Autowired
|
||||
private HttpConnector httpConnector;
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
String repositoryListURL = interfaceDescriptor.getBaseUrl() + repositoryListPath;
|
||||
String input;
|
||||
try {
|
||||
input = httpConnector.getInputSource(repositoryListURL);
|
||||
return new Re3DataRepositoriesIterator(IOUtils.toInputStream(input, "UTF-8"), interfaceDescriptor.getBaseUrl(), getHttpConnector());
|
||||
} catch (IOException e) {
|
||||
throw new CollectorServiceException(e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public String getRepositoryListPath() {
|
||||
return repositoryListPath;
|
||||
}
|
||||
|
||||
public void setRepositoryListPath(final String repositoryListPath) {
|
||||
this.repositoryListPath = repositoryListPath;
|
||||
}
|
||||
|
||||
public HttpConnector getHttpConnector() {
|
||||
return httpConnector;
|
||||
}
|
||||
|
||||
public void setHttpConnector(final HttpConnector httpConnector) {
|
||||
this.httpConnector = httpConnector;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,151 @@
|
|||
package eu.dnetlib.data.collector.plugins.datasources;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import javax.xml.stream.XMLInputFactory;
|
||||
import javax.xml.stream.XMLStreamConstants;
|
||||
import javax.xml.stream.XMLStreamException;
|
||||
import javax.xml.stream.XMLStreamReader;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.HttpConnector;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
|
||||
|
||||
public class Re3DataRepositoriesIterator implements Iterator<String>, Iterable<String> {
|
||||
|
||||
private static final Log log = LogFactory.getLog(Re3DataRepositoriesIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
|
||||
|
||||
private String baseURL;
|
||||
private XMLStreamReader reader;
|
||||
private int countedRepos = 0;
|
||||
private String currentRepoPath = null;
|
||||
|
||||
private HttpConnector httpConnector;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return currentRepoPath != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
if (currentRepoPath == null) throw new NoSuchElementException();
|
||||
|
||||
try {
|
||||
String repoInfo = getRepositoryInfo(currentRepoPath);
|
||||
return repoInfo;
|
||||
} finally {
|
||||
currentRepoPath = moveToNextRepo();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
public Re3DataRepositoriesIterator(final InputStream xmlInputStream, final String baseUrl, final HttpConnector httpConnector) throws CollectorServiceException {
|
||||
this.httpConnector = httpConnector;
|
||||
XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
try {
|
||||
reader = factory.createXMLStreamReader(xmlInputStream);
|
||||
} catch (XMLStreamException e) {
|
||||
throw new CollectorServiceException(e);
|
||||
}
|
||||
baseURL = baseUrl;
|
||||
|
||||
// try to fetch the 1st
|
||||
currentRepoPath = moveToNextRepo();
|
||||
}
|
||||
|
||||
private String getNextRepositoryPath() {
|
||||
return reader.getAttributeValue(null, "href");
|
||||
}
|
||||
|
||||
private String moveToNextRepo() {
|
||||
try {
|
||||
while (reader.hasNext()) {
|
||||
int event = reader.next();
|
||||
if (event == XMLStreamConstants.START_ELEMENT) {
|
||||
String elementName = reader.getLocalName();
|
||||
if (elementName.equals("link")) {
|
||||
String repoPath = getNextRepositoryPath();
|
||||
log.debug(String.format("Found %s repositories. The last has link %s", ++countedRepos, repoPath));
|
||||
return repoPath;
|
||||
}
|
||||
}
|
||||
}
|
||||
log.info("Seems there are no more repository to iterate on. Total: " + countedRepos);
|
||||
return null;
|
||||
} catch (XMLStreamException e) {
|
||||
throw new CollectorServiceRuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private String getRepositoryInfo(final String repositoryPath) throws CollectorServiceRuntimeException {
|
||||
|
||||
String targetURL = repositoryPath;
|
||||
if(!repositoryPath.startsWith(baseURL))
|
||||
targetURL = baseURL + repositoryPath;
|
||||
try {
|
||||
log.info(targetURL);
|
||||
String inputSource = getHttpConnector().getInputSource(targetURL);
|
||||
|
||||
return XmlCleaner.cleanAllEntities(inputSource);
|
||||
} catch (CollectorServiceException e) {
|
||||
throw new CollectorServiceRuntimeException("OOOPS something bad happen getting repo info from " + targetURL, e);
|
||||
}
|
||||
}
|
||||
|
||||
// public String testAccess(){
|
||||
// return getRepositoryInfo("/api/v1/repository/r3d100012823");
|
||||
// }
|
||||
	/** @return the base URL of the registry service (prepended to relative paths). */
	public String getBaseURL() {
		return baseURL;
	}

	public void setBaseURL(final String baseURL) {
		this.baseURL = baseURL;
	}

	/** @return the number of repositories seen so far by the iteration. */
	public int getCountedRepos() {
		return countedRepos;
	}

	public void setCountedRepos(final int countedRepos) {
		this.countedRepos = countedRepos;
	}

	/** @return the StAX reader positioned on the repository list document. */
	public XMLStreamReader getReader() {
		return reader;
	}

	public void setReader(final XMLStreamReader reader) {
		this.reader = reader;
	}

	/** @return the path of the repository the iteration currently points to. */
	public String getCurrentRepoPath() {
		return currentRepoPath;
	}

	public void setCurrentRepoPath(final String currentRepoPath) {
		this.currentRepoPath = currentRepoPath;
	}

	/** @return the HTTP connector used to fetch repository pages. */
	public HttpConnector getHttpConnector() {
		return httpConnector;
	}

	public void setHttpConnector(final HttpConnector httpConnector) {
		this.httpConnector = httpConnector;
	}
|
||||
}
|
|
@ -0,0 +1,57 @@
|
|||
package eu.dnetlib.data.collector.plugins.excel;
|
||||
|
||||
/**
|
||||
* Created by miriam on 10/05/2017.
|
||||
*/
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.util.ArrayList;
|
||||
import org.apache.commons.csv.CSVPrinter;
|
||||
import org.apache.commons.csv.CSVFormat;
|
||||
|
||||
public class CSVFileWriter {
|
||||
private static final String NEW_LINE_SEPARATOR = "\n";
|
||||
|
||||
private Object [] file_header ;
|
||||
private ArrayList<ArrayList<String>> projects = new ArrayList<ArrayList<String>>();
|
||||
|
||||
public void setHeader(String[] header){
|
||||
this.file_header = header;
|
||||
}
|
||||
|
||||
public void addProject(ArrayList<String> project) {
|
||||
projects.add(project);
|
||||
|
||||
}
|
||||
|
||||
public void writeFile(String csv_file_path){
|
||||
BufferedWriter writer = null;
|
||||
CSVPrinter csvFilePrinter = null;
|
||||
|
||||
CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR);
|
||||
|
||||
try{
|
||||
writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csv_file_path),"UTF-8"));
|
||||
|
||||
csvFilePrinter = new CSVPrinter(writer,csvFileFormat);
|
||||
csvFilePrinter.printRecord(file_header);
|
||||
|
||||
for(ArrayList<String> project:projects){
|
||||
csvFilePrinter.printRecord(project);
|
||||
}
|
||||
}catch(Exception e){
|
||||
e.printStackTrace();
|
||||
}finally{
|
||||
try{
|
||||
writer.flush();
|
||||
writer.close();
|
||||
csvFilePrinter.close();
|
||||
}catch(IOException ioe){
|
||||
ioe.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,256 @@
|
|||
package eu.dnetlib.data.collector.plugins.excel;
|
||||
|
||||
/**
|
||||
* Created by miriam on 10/05/2017.
|
||||
*/
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.poi.ss.usermodel.Cell;
|
||||
import org.apache.poi.ss.usermodel.DataFormatter;
|
||||
import org.apache.poi.ss.usermodel.Row;
|
||||
import org.apache.poi.ss.usermodel.Sheet;
|
||||
import org.apache.poi.ss.usermodel.Workbook;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.json.*;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
|
||||
public class Read {
|
||||
|
||||
private static final Log log = LogFactory.getLog(Read.class);
|
||||
|
||||
/** The descriptor. */
|
||||
private InterfaceDescriptor descriptor;
|
||||
|
||||
|
||||
/*private final String EXCEL_FILE_URL ="https://pf.fwf.ac.at/en/research-in-practice/project-finder.xlsx?&&&search%5Bcall%5D=&search%5Bdecision_board_ids%5D=&search%5Bend_date%5D=&search%5Binstitute_name%5D=&search%5Blead_firstname%5D=&search%5Blead_lastname%5D=&search%5Bper_page%5D=10&search%5Bproject_number%5D=&search%5Bproject_title%5D=&search%5Bscience_discipline_id%5D=&search%5Bstart_date%5D=&search%5Bstatus_id%5D=&search%5Bwhat%5D=&action=index&controller=projects&locale=en&per_page=10";
|
||||
private final String CSV_FILE_PATH = "//Users//miriam//Documents//svn//mirima//FWF//projects_search2017.05.09.5.csv";
|
||||
private final String argument = "{\"replace\":{\"header\":[{\"from\":\"&\",\"to\":\"and\"}],\"body\":[{\"from\":\"\\n\",\"to\":\" \"}]}," +
|
||||
"\"replace_currency\":[{\"from\":\"$\",\"to\":\"€\"}],"
|
||||
+ "\"col_currency\":10}"; */
|
||||
private Sheet sheet;
|
||||
private CSVFileWriter csv_writer = new CSVFileWriter();
|
||||
private HashMap<String,String> map_header = new HashMap<String,String>();
|
||||
private HashMap<String,String> map_body = new HashMap<String,String>();
|
||||
private int header_row;
|
||||
private String file_to_save ;
|
||||
private boolean replace_currency = false;
|
||||
private String from_currency, to_currency;
|
||||
private boolean remove_empty, remove_tmp_file;
|
||||
private String remove_id;
|
||||
private int column_id;
|
||||
private int currency_column;
|
||||
private int sheet_number;
|
||||
private String tmp_file;
|
||||
private String argument;
|
||||
private String identifier;
|
||||
|
||||
private HttpCSVCollectorPlugin collector;
|
||||
|
||||
public HttpCSVCollectorPlugin getCollector() {
|
||||
return collector;
|
||||
}
|
||||
|
||||
public void setCollector(HttpCSVCollectorPlugin collector) {
|
||||
this.collector = collector;
|
||||
}
|
||||
|
||||
public Read(InterfaceDescriptor descriptor){
|
||||
this.descriptor = descriptor;
|
||||
|
||||
}
|
||||
|
||||
private static String getCellValue( Cell cell)
|
||||
{
|
||||
DataFormatter formatter = new DataFormatter();
|
||||
String formattedCellValue = formatter.formatCellValue(cell);
|
||||
return formattedCellValue;
|
||||
|
||||
}
|
||||
|
||||
private void copyFile() throws IOException{
|
||||
FileUtils.copyURLToFile(new URL(descriptor.getBaseUrl()), new File(tmp_file));
|
||||
|
||||
}
|
||||
|
||||
private void parseDescriptor(){
|
||||
HashMap<String, String> params = descriptor.getParams();
|
||||
argument = params.get("argument");
|
||||
header_row = Integer.parseInt(params.get("header_row"));
|
||||
tmp_file = params.get("tmp_file");
|
||||
remove_empty = (params.get("remove_empty_lines") == "yes");
|
||||
remove_id = params.get("remove_lines_with_id");
|
||||
column_id = Integer.parseInt(params.get("col_id"));
|
||||
remove_tmp_file = (params.get("remove_tmp_file") == "yes");
|
||||
sheet_number = Integer.parseInt(params.get("sheet_number"));
|
||||
file_to_save = params.get("file_to_save");
|
||||
}
|
||||
private void init() throws IOException{
|
||||
parseDescriptor();
|
||||
log.info("Parsing the arguments");
|
||||
parseArguments();
|
||||
log.info("Copying the file in temp local file");
|
||||
copyFile();
|
||||
log.info("Extracting the sheet " + sheet_number);
|
||||
FileInputStream fis = new FileInputStream(tmp_file);
|
||||
Workbook workbook = new XSSFWorkbook(fis);
|
||||
sheet = workbook.getSheetAt(sheet_number);
|
||||
fis.close();
|
||||
if(remove_tmp_file) {
|
||||
File f = new File(tmp_file);
|
||||
f.delete();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void fillMap(JSONObject json, HashMap<String,String> map, String elem){
|
||||
try{
|
||||
final JSONArray arr = json.getJSONObject("replace").getJSONArray(elem);
|
||||
for(Object entry: arr)
|
||||
map.put(((JSONObject)entry).getString("from"), ((JSONObject)entry).getString("to"));
|
||||
}catch(Throwable e){
|
||||
log.error("Problems filling the map for " + elem);
|
||||
throw(e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
private void parseArguments() {
|
||||
if (StringUtils.isNotEmpty(argument)){
|
||||
try{
|
||||
final JSONObject json = new JSONObject(argument);
|
||||
if(json.has("header"))
|
||||
fillMap(json, map_header,"header");
|
||||
if (json.has("body"))
|
||||
fillMap(json,map_body,"body");
|
||||
|
||||
if(json.has("replace_currency"))
|
||||
{
|
||||
replace_currency = true ;
|
||||
from_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("from");
|
||||
to_currency = json.getJSONArray("replace_currency").getJSONObject(0).getString("to");
|
||||
|
||||
}
|
||||
|
||||
if (json.has("col_currency"))
|
||||
currency_column = json.getInt("col_currency");
|
||||
}catch(Throwable e){
|
||||
log.error("Problems while parsing the argument parameter.");
|
||||
throw (e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
private String applyReplace(String row, HashMap<String,String>replace){
|
||||
for(String key: replace.keySet()){
|
||||
if(row.contains(key))
|
||||
row = row.replace(key, replace.get(key));
|
||||
}
|
||||
return row;
|
||||
}
|
||||
|
||||
private void getHeader(){
|
||||
Row row = sheet.getRow(header_row);
|
||||
Iterator<Cell> cellIterator = row.cellIterator();
|
||||
Cell cell;
|
||||
String project = "";
|
||||
int count = 0;
|
||||
while (cellIterator.hasNext()){
|
||||
cell = cellIterator.next();
|
||||
final String stringCellValue = cell.getStringCellValue();
|
||||
project += applyReplace(stringCellValue,map_header) + ";";
|
||||
if(count++ == column_id) identifier = applyReplace(stringCellValue,map_header);
|
||||
}
|
||||
project = project.substring(0, project.length() -1 );
|
||||
csv_writer.setHeader(project.split(";"));
|
||||
|
||||
}
|
||||
|
||||
private void getData(){
|
||||
Row row;
|
||||
Cell cell;
|
||||
String tmp;
|
||||
Iterator<Cell>cellIterator;
|
||||
for(int row_number = header_row + 1; row_number < sheet.getLastRowNum(); row_number++){
|
||||
row = sheet.getRow(row_number);
|
||||
if (row != null) {
|
||||
cellIterator = row.cellIterator();
|
||||
|
||||
int col_number = 0;
|
||||
|
||||
boolean discard_row = false;
|
||||
ArrayList<String> al = new ArrayList<String>();
|
||||
while (cellIterator.hasNext() && !discard_row) {
|
||||
cell = cellIterator.next();
|
||||
tmp = getCellValue(cell).trim();
|
||||
tmp = tmp.replace("\n"," ");
|
||||
if (col_number == column_id &&
|
||||
((remove_empty && tmp.trim().equals("")) ||
|
||||
(!remove_id.equals("") && tmp.equals(remove_id))))
|
||||
discard_row = true;
|
||||
|
||||
if (replace_currency && col_number == currency_column)
|
||||
tmp = tmp.replace(from_currency, to_currency);
|
||||
|
||||
al.add(applyReplace(tmp, map_body));
|
||||
col_number++;
|
||||
}
|
||||
if (!discard_row) {
|
||||
csv_writer.addProject(al);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void writeCSVFile(){
|
||||
|
||||
csv_writer.writeFile(file_to_save);
|
||||
}
|
||||
|
||||
private InterfaceDescriptor prepareHTTPCSVDescriptor(){
|
||||
InterfaceDescriptor dex = new InterfaceDescriptor();
|
||||
dex.setBaseUrl("file://"+file_to_save);
|
||||
HashMap<String, String> params = new HashMap<String, String>();
|
||||
params.put("separator", descriptor.getParams().get("separator"));
|
||||
params.put("identifier",identifier);
|
||||
params.put("quote",descriptor.getParams().get("quote"));
|
||||
dex.setParams(params);
|
||||
return dex;
|
||||
}
|
||||
|
||||
public Iterable<String> parseFile() throws Exception{
|
||||
|
||||
|
||||
init();
|
||||
log.info("Getting header elements");
|
||||
getHeader();
|
||||
log.info("Getting sheet data");
|
||||
getData();
|
||||
log.info("Writing the csv file");
|
||||
writeCSVFile();
|
||||
log.info("Preparing to parse csv");
|
||||
|
||||
return collector.collect(prepareHTTPCSVDescriptor(),"","");
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
package eu.dnetlib.data.collector.plugins.excel;
|
||||
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.plugins.HttpCSVCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Required;
|
||||
|
||||
/**
|
||||
* Created by miriam on 10/05/2017.
|
||||
*/
|
||||
public class ReadExcelPlugin extends AbstractCollectorPlugin{
|
||||
|
||||
private static final Log log = LogFactory.getLog(ReadExcelPlugin.class);
|
||||
@Autowired
|
||||
HttpCSVCollectorPlugin httpCSVCollectorPlugin;
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
Read r = new Read(interfaceDescriptor);
|
||||
r.setCollector(httpCSVCollectorPlugin);
|
||||
|
||||
try {
|
||||
return r.parseFile();
|
||||
}catch(Exception e){
|
||||
log.error("Error importing excel file");
|
||||
throw new CollectorServiceException(e);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
/**
|
||||
*
|
||||
*/
|
||||
package eu.dnetlib.data.collector.plugins.filesfrommetadata;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
|
||||
/**
|
||||
* @author sandro
|
||||
*
|
||||
*/
|
||||
public class FilesFromMetadataCollectorPlugin extends AbstractCollectorPlugin {

	/**
	 * {@inheritDoc}
	 * @see eu.dnetlib.data.collector.plugin.CollectorPlugin#collect(eu.dnetlib.data.collector.rmi.InterfaceDescriptor, java.lang.String, java.lang.String)
	 */
	@Override
	public Iterable<String> collect(final InterfaceDescriptor arg0, final String arg1, final String arg2) throws CollectorServiceException {
		// TODO Auto-generated method stub
		// NOTE(review): unimplemented stub — returning null will NPE any caller
		// that iterates the result; implement or throw an explicit exception.
		return null;
	}

}
|
|
@ -0,0 +1,61 @@
|
|||
package eu.dnetlib.data.collector.plugins.filesfrommetadata;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.data.collector.functions.ParamValuesFunction;
|
||||
import eu.dnetlib.data.collector.rmi.ProtocolParameterValue;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import eu.dnetlib.enabling.locators.UniqueServiceLocator;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
|
||||
/**
|
||||
* Created by alessia on 17/12/15.
|
||||
*/
|
||||
public class PopulateFileDownloadBasePath implements ParamValuesFunction {
|
||||
|
||||
private static final Log log = LogFactory.getLog(PopulateFileDownloadBasePath.class);
|
||||
@Autowired
|
||||
private UniqueServiceLocator serviceLocator;
|
||||
|
||||
@Value("${services.objectstore.basePathList.xquery}")
|
||||
private String xQueryForObjectStoreBasePath;
|
||||
|
||||
@Override
|
||||
public List<ProtocolParameterValue> findValues(final String s, final Map<String, String> map) {
|
||||
try {
|
||||
return Lists.transform(serviceLocator.getService(ISLookUpService.class).quickSearchProfile(xQueryForObjectStoreBasePath),
|
||||
new Function<String, ProtocolParameterValue>() {
|
||||
@Override
|
||||
public ProtocolParameterValue apply(final String s) {
|
||||
return new ProtocolParameterValue(s, s);
|
||||
}
|
||||
});
|
||||
} catch (ISLookUpException e) {
|
||||
log.error("Cannot read Object store service properties", e);
|
||||
}
|
||||
return Lists.newArrayList();
|
||||
}
|
||||
|
||||
public UniqueServiceLocator getServiceLocator() {
|
||||
return serviceLocator;
|
||||
}
|
||||
|
||||
public void setServiceLocator(final UniqueServiceLocator serviceLocator) {
|
||||
this.serviceLocator = serviceLocator;
|
||||
}
|
||||
|
||||
public String getxQueryForObjectStoreBasePath() {
|
||||
return xQueryForObjectStoreBasePath;
|
||||
}
|
||||
|
||||
public void setxQueryForObjectStoreBasePath(final String xQueryForObjectStoreBasePath) {
|
||||
this.xQueryForObjectStoreBasePath = xQueryForObjectStoreBasePath;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,89 @@
|
|||
package eu.dnetlib.data.collector.plugins.filesystem;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.io.FilenameUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import com.google.common.collect.Iterators;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
/**
|
||||
* Class enabling lazy and recursive iteration of a filesystem tree. The iterator iterates over file paths.
|
||||
*
|
||||
* @author Andrea
|
||||
*
|
||||
*/
|
||||
public class FileSystemIterator implements Iterator<String> {
|
||||
|
||||
/** The logger */
|
||||
private static final Log log = LogFactory.getLog(FileSystemIterator.class);
|
||||
|
||||
private Set<String> extensions = Sets.newHashSet();
|
||||
private Iterator<Path> pathIterator;
|
||||
private String current;
|
||||
|
||||
public FileSystemIterator(final String baseDir, final String extensions) {
|
||||
if(StringUtils.isNotBlank(extensions)) {
|
||||
this.extensions = Sets.newHashSet(extensions.split(","));
|
||||
}
|
||||
try {
|
||||
this.pathIterator = Files.newDirectoryStream(Paths.get(baseDir)).iterator();
|
||||
this.current = walkTillNext();
|
||||
} catch (IOException e) {
|
||||
log.error("Cannot initialize File System Iterator. Is this path correct? " + baseDir);
|
||||
throw new RuntimeException("Filesystem collection error.", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return current != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized String next() {
|
||||
String pivot = new String(current);
|
||||
current = walkTillNext();
|
||||
log.debug("Returning: " + pivot);
|
||||
return pivot;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {}
|
||||
|
||||
/**
|
||||
* Walk the filesystem recursively until it finds a candidate. Strategies: a) For any directory found during the walk, an iterator is
|
||||
* built and concat to the main one; b) Any file is checked against admitted extensions
|
||||
*
|
||||
* @return the next element to be returned by next call of this.next()
|
||||
*/
|
||||
private synchronized String walkTillNext() {
|
||||
while (pathIterator.hasNext()) {
|
||||
Path nextFilePath = pathIterator.next();
|
||||
if (Files.isDirectory(nextFilePath)) {
|
||||
// concat
|
||||
try {
|
||||
pathIterator = Iterators.concat(pathIterator, Files.newDirectoryStream(nextFilePath).iterator());
|
||||
log.debug("Adding folder iterator: " + nextFilePath.toString());
|
||||
} catch (IOException e) {
|
||||
log.error("Cannot create folder iterator! Is this path correct? " + nextFilePath.toString());
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
if (extensions.isEmpty() || extensions.contains(FilenameUtils.getExtension(nextFilePath.toString()))) {
|
||||
log.debug("Returning: " + nextFilePath.toString());
|
||||
return nextFilePath.toString();
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
package eu.dnetlib.data.collector.plugins.filesystem;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author andrea
|
||||
*
|
||||
*/
|
||||
public class FilesystemCollectorPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
|
||||
final String baseUrl = interfaceDescriptor.getBaseUrl();
|
||||
if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
|
||||
return new FilesystemIterable(interfaceDescriptor);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,139 @@
|
|||
package eu.dnetlib.data.collector.plugins.filesystem;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.collect.Iterators;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.ximpleware.*;
|
||||
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
import org.apache.commons.io.FilenameUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.json.JSONObject;
|
||||
import org.json.XML;
|
||||
|
||||
/**
|
||||
* The Class FilesystemIterable.
|
||||
*
|
||||
* @author Sandro, Michele, Andrea
|
||||
*/
|
||||
public class FilesystemIterable implements Iterable<String> {

	/**
	 * The Constant log.
	 */
	private static final Log log = LogFactory.getLog(FilesystemIterable.class);

	/**
	 * The base dir under which files are collected.
	 */
	private File baseDir;

	/**
	 * Comma-separated list of admitted file extensions (may be null = all).
	 */
	private String extensions;

	/**
	 * File format (json / xml); defaults to xml.
	 **/
	private String fileFormat = "xml";

	private List<String> supportedFormats = Lists.newArrayList("xml", "json");

	// when true, a <header><objIdentifier> derived from the file name is injected into each record
	private boolean setObjIdentifierFromFileName = false;

	/**
	 * Instantiates a new filesystem iterable.
	 *
	 * @param descriptor the descriptor; its baseUrl must be a file:// URL pointing to an existing directory
	 * @throws CollectorServiceException the collector service exception
	 */
	public FilesystemIterable(final InterfaceDescriptor descriptor) throws CollectorServiceException {
		try {
			final String baseUrl = descriptor.getBaseUrl();
			URL basePath = new URL(baseUrl);
			this.baseDir = new File(basePath.getPath());
			if (!baseDir.exists()) { throw new CollectorServiceException(String.format("The base ULR %s, does not exist", basePath.getPath())); }
			this.extensions = descriptor.getParams().get("extensions");
			if (descriptor.getParams().containsKey("fileFormat")) fileFormat = descriptor.getParams().get("fileFormat");
			if (!supportedFormats.contains(fileFormat))
				throw new CollectorServiceException("File format " + fileFormat + " not supported. Supported formats are: " + StringUtils
						.join(supportedFormats, ','));
			if (descriptor.getParams().containsKey("setObjIdentifierFromFileName")) {
				setObjIdentifierFromFileName = Boolean.parseBoolean(descriptor.getParams().get("setObjIdentifierFromFileName"));
			}
		} catch (MalformedURLException e) {
			throw new CollectorServiceException("Filesystem collector failed! ", e);
		}
	}

	/**
	 * {@inheritDoc}
	 *
	 * Lazily transforms each file path produced by {@link FileSystemIterator}
	 * into a record string: JSON files are wrapped into a <record> XML envelope,
	 * XML files are entity-cleaned (and BOM-stripped). On any failure an empty
	 * string is returned for that file and the error is logged.
	 *
	 * @see java.lang.Iterable#iterator()
	 */
	@Override
	public Iterator<String> iterator() {
		final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), extensions);
		return Iterators.transform(fsi, inputFileName -> {
			FileInputStream fileInputStream = null;
			try {
				fileInputStream = new FileInputStream(inputFileName);
				// NOTE(review): IOUtils.toString without a charset uses the platform
				// default encoding — confirm inputs are encoded accordingly.
				final String s = IOUtils.toString(fileInputStream);
				if (fileFormat.equalsIgnoreCase("json")) {
					JSONObject json = new JSONObject(s);
					JSONObject obj = new JSONObject();
					if (setObjIdentifierFromFileName) {
						obj.put("header", new JSONObject().put("objIdentifier", FilenameUtils.getBaseName(inputFileName)));
					}
					obj.put("metadata", json);
					log.debug(obj.toString());
					return XML.toString(obj, "record");
				}
				// strip a leading UTF-8 BOM before entity cleaning
				String cleanedXML = XmlCleaner.cleanAllEntities(s.startsWith("\uFEFF") ? s.substring(1) : s);
				if (setObjIdentifierFromFileName) {
					return addObjIdentifier(cleanedXML, FilenameUtils.getBaseName(inputFileName));
				} else return cleanedXML;
			} catch (VTDException e) {
				log.error("Cannot process with VTD to set the objIdentifier " + inputFileName);
				return "";
			} catch (Exception e) {
				log.error("Unable to read " + inputFileName);
				return "";
			} finally {
				if (fileInputStream != null) {
					try {
						fileInputStream.close();
					} catch (IOException e) {
						log.error("Unable to close inputstream for " + inputFileName);
					}
				}
			}
		});
	}

	/**
	 * Wraps the given XML document into
	 * {@code <record><header><objIdentifier>...</objIdentifier></header><metadata>...</metadata></record>}
	 * using VTD-XML, leaving the original document untouched when it has no root element.
	 *
	 * @param xml the source document
	 * @param objidentifier the identifier to inject
	 * @return the wrapped document, UTF-8 encoded
	 */
	private String addObjIdentifier(String xml, String objidentifier) throws VTDException, IOException {
		VTDGen vg = new VTDGen(); // Instantiate VTDGen
		XMLModifier xm = new XMLModifier(); //Instantiate XMLModifier
		vg.setDoc(xml.getBytes("UTF-8"));
		vg.parse(false);
		VTDNav vn = vg.getNav();
		xm.bind(vn);
		if (vn.toElement(VTDNav.ROOT)) {
			xm.insertBeforeElement("<record><header><objIdentifier>" + objidentifier + "</objIdentifier></header><metadata>");
			xm.insertAfterElement("</metadata></record>");
		}
		ByteArrayOutputStream baos = new ByteArrayOutputStream();
		xm.output(baos);
		return baos.toString("UTF-8");
	}
}
|
|
@ -0,0 +1,66 @@
|
|||
package eu.dnetlib.data.collector.plugins.ftp;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
import org.springframework.beans.factory.annotation.Required;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Author: Andrea Mannocci
|
||||
*
|
||||
*/
|
||||
public class FtpCollectorPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
private FtpIteratorFactory ftpIteratorFactory;
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
|
||||
final String baseUrl = interfaceDescriptor.getBaseUrl();
|
||||
final String username = interfaceDescriptor.getParams().get("username");
|
||||
final String password = interfaceDescriptor.getParams().get("password");
|
||||
final String recursive = interfaceDescriptor.getParams().get("recursive");
|
||||
final String extensions = interfaceDescriptor.getParams().get("extensions");
|
||||
|
||||
if ((baseUrl == null) || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
|
||||
if ((username == null) || username.isEmpty()) { throw new CollectorServiceException("Param 'username' is null or empty"); }
|
||||
if ((password == null) || password.isEmpty()) { throw new CollectorServiceException("Param 'password' is null or empty"); }
|
||||
if ((recursive == null) || recursive.isEmpty()) { throw new CollectorServiceException("Param 'recursive' is null or empty"); }
|
||||
if ((extensions == null) || extensions.isEmpty()) { throw new CollectorServiceException("Param 'extensions' is null or empty"); }
|
||||
|
||||
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + fromDate); }
|
||||
|
||||
return new Iterable<String>() {
|
||||
|
||||
boolean isRecursive = "true".equals(recursive);
|
||||
|
||||
Set<String> extensionsSet = parseSet(extensions);
|
||||
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
return getFtpIteratorFactory().newIterator(baseUrl, username, password, isRecursive, extensionsSet, fromDate);
|
||||
}
|
||||
|
||||
private Set<String> parseSet(final String extensions) {
|
||||
return Sets.newHashSet(Splitter.on(",").omitEmptyStrings().trimResults().split(extensions));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public FtpIteratorFactory getFtpIteratorFactory() {
|
||||
return ftpIteratorFactory;
|
||||
}
|
||||
|
||||
@Required
|
||||
public void setFtpIteratorFactory(final FtpIteratorFactory ftpIteratorFactory) {
|
||||
this.ftpIteratorFactory = ftpIteratorFactory;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,208 @@
|
|||
package eu.dnetlib.data.collector.plugins.ftp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.*;
|
||||
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
|
||||
import org.apache.commons.io.output.ByteArrayOutputStream;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.commons.net.ftp.FTPClient;
|
||||
import org.apache.commons.net.ftp.FTPFile;
|
||||
import org.apache.commons.net.ftp.FTPReply;
|
||||
import org.joda.time.DateTime;
|
||||
import org.joda.time.format.DateTimeFormat;
|
||||
import org.joda.time.format.DateTimeFormatter;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Author: Andrea Mannocci
|
||||
*
|
||||
*/
|
||||
public class FtpIterator implements Iterator<String> {
|
||||
|
||||
private static final Log log = LogFactory.getLog(FtpIterator.class);
|
||||
|
||||
private static final int MAX_RETRIES = 5;
|
||||
private static final int DEFAULT_TIMEOUT = 30000;
|
||||
private static final long BACKOFF_MILLIS = 10000;
|
||||
|
||||
private FTPClient ftpClient;
|
||||
private String ftpServerAddress;
|
||||
private String remoteFtpBasePath;
|
||||
private String username;
|
||||
private String password;
|
||||
private boolean isRecursive;
|
||||
private Set<String> extensionsSet;
|
||||
private boolean incremental;
|
||||
private DateTime fromDate = null;
|
||||
private DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
|
||||
|
||||
private Queue<String> queue;
|
||||
|
||||
public FtpIterator(final String baseUrl, final String username, final String password, final boolean isRecursive,
|
||||
final Set<String> extensionsSet, String fromDate) {
|
||||
this.username = username;
|
||||
this.password = password;
|
||||
this.isRecursive = isRecursive;
|
||||
this.extensionsSet = extensionsSet;
|
||||
this.incremental = StringUtils.isNotBlank(fromDate);
|
||||
if (incremental) {
|
||||
//I expect fromDate in the format 'yyyy-MM-dd'. See class eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode .
|
||||
this.fromDate = DateTime.parse(fromDate, simpleDateTimeFormatter);
|
||||
log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString());
|
||||
}
|
||||
try {
|
||||
URL server = new URL(baseUrl);
|
||||
this.ftpServerAddress = server.getHost();
|
||||
this.remoteFtpBasePath = server.getPath();
|
||||
} catch (MalformedURLException e1) {
|
||||
throw new CollectorServiceRuntimeException("Malformed URL exception " + baseUrl);
|
||||
}
|
||||
|
||||
connectToFtpServer();
|
||||
initializeQueue();
|
||||
}
|
||||
|
||||
private void connectToFtpServer() {
|
||||
ftpClient = new FTPClient();
|
||||
ftpClient.setDefaultTimeout(DEFAULT_TIMEOUT);
|
||||
ftpClient.setDataTimeout(DEFAULT_TIMEOUT);
|
||||
ftpClient.setConnectTimeout(DEFAULT_TIMEOUT);
|
||||
try {
|
||||
ftpClient.connect(ftpServerAddress);
|
||||
|
||||
// try to login
|
||||
if (!ftpClient.login(username, password)) {
|
||||
ftpClient.logout();
|
||||
throw new CollectorServiceRuntimeException("Unable to login to FTP server " + ftpServerAddress);
|
||||
}
|
||||
int reply = ftpClient.getReplyCode();
|
||||
if (!FTPReply.isPositiveCompletion(reply)) {
|
||||
ftpClient.disconnect();
|
||||
throw new CollectorServiceRuntimeException("Unable to connect to FTP server " + ftpServerAddress);
|
||||
}
|
||||
|
||||
ftpClient.enterLocalPassiveMode();
|
||||
log.info("Connected to FTP server " + ftpServerAddress);
|
||||
log.info(String.format("FTP collecting from %s with recursion = %s", remoteFtpBasePath, isRecursive));
|
||||
} catch (IOException e) {
|
||||
throw new CollectorServiceRuntimeException("Unable to connect to FTP server " + ftpServerAddress);
|
||||
}
|
||||
}
|
||||
|
||||
private void disconnectFromFtpServer() {
|
||||
try {
|
||||
if (ftpClient.isConnected()) {
|
||||
ftpClient.logout();
|
||||
ftpClient.disconnect();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("Failed to logout & disconnect from the FTP server", e);
|
||||
}
|
||||
}
|
||||
|
||||
private void initializeQueue() {
|
||||
queue = new LinkedList<String>();
|
||||
listDirectoryRecursive(remoteFtpBasePath, "");
|
||||
}
|
||||
|
||||
private void listDirectoryRecursive(final String parentDir, final String currentDir) {
|
||||
String dirToList = parentDir;
|
||||
if (!currentDir.equals("")) {
|
||||
dirToList += "/" + currentDir;
|
||||
}
|
||||
FTPFile[] subFiles;
|
||||
try {
|
||||
subFiles = ftpClient.listFiles(dirToList);
|
||||
if ((subFiles != null) && (subFiles.length > 0)) {
|
||||
for (FTPFile aFile : subFiles) {
|
||||
String currentFileName = aFile.getName();
|
||||
|
||||
if (currentFileName.equals(".") || currentFileName.equals("..")) {
|
||||
// skip parent directory and directory itself
|
||||
continue;
|
||||
}
|
||||
if (aFile.isDirectory()) {
|
||||
if (isRecursive) {
|
||||
listDirectoryRecursive(dirToList, currentFileName);
|
||||
}
|
||||
} else {
|
||||
// test the file for extensions compliance and, just in case, add it to the list.
|
||||
for (String ext : extensionsSet) {
|
||||
if (currentFileName.endsWith(ext)) {
|
||||
//incremental mode: let's check the last update date
|
||||
if(incremental){
|
||||
Calendar timestamp = aFile.getTimestamp();
|
||||
DateTime lastModificationDate = new DateTime(timestamp);
|
||||
if(lastModificationDate.isAfter(fromDate)){
|
||||
queue.add(dirToList + "/" + currentFileName);
|
||||
log.debug(currentFileName + " has changed and must be re-collected");
|
||||
} else {
|
||||
if (log.isDebugEnabled()) {
|
||||
log.debug(currentFileName + " has not changed since last collection");
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
//not incremental: just add it to the queue
|
||||
queue.add(dirToList + "/" + currentFileName);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new CollectorServiceRuntimeException("Unable to list FTP remote folder", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (queue.isEmpty()) {
|
||||
disconnectFromFtpServer();
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
String nextRemotePath = queue.remove();
|
||||
int nRepeat = 0;
|
||||
while (nRepeat < MAX_RETRIES) {
|
||||
try {
|
||||
OutputStream baos = new ByteArrayOutputStream();
|
||||
if (!ftpClient.isConnected()) {
|
||||
connectToFtpServer();
|
||||
}
|
||||
ftpClient.retrieveFile(nextRemotePath, baos);
|
||||
|
||||
log.debug(String.format("Collected file from FTP: %s%s", ftpServerAddress, nextRemotePath));
|
||||
return baos.toString();
|
||||
} catch (IOException e) {
|
||||
nRepeat++;
|
||||
log.warn(String.format("An error occurred [%s] for %s%s, retrying.. [retried %s time(s)]", e.getMessage(), ftpServerAddress, nextRemotePath,
|
||||
nRepeat));
|
||||
disconnectFromFtpServer();
|
||||
try {
|
||||
Thread.sleep(BACKOFF_MILLIS);
|
||||
} catch (InterruptedException e1) {
|
||||
log.error(e1);
|
||||
}
|
||||
}
|
||||
}
|
||||
throw new CollectorServiceRuntimeException(String.format("Impossible to retrieve FTP file %s after %s retries. Aborting FTP collection.", nextRemotePath, nRepeat));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
package eu.dnetlib.data.collector.plugins.ftp;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Author: Andrea Mannocci
|
||||
*
|
||||
*/
|
||||
public class FtpIteratorFactory {
|
||||
|
||||
public Iterator<String> newIterator(final String baseUrl,
|
||||
final String username,
|
||||
final String password,
|
||||
final boolean isRecursive,
|
||||
final Set<String> extensionsSet, final String fromDate) {
|
||||
return new FtpIterator(baseUrl, username, password, isRecursive, extensionsSet, fromDate);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
package eu.dnetlib.data.collector.plugins.httpfilename;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.HttpConnector;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
|
||||
|
||||
/**
|
||||
* Created by miriam on 07/05/2018.
|
||||
*/
|
||||
public class Connector extends HttpConnector implements ConnectorInterface {
|
||||
private String response;
|
||||
|
||||
@Override
|
||||
public void get(final String requestUrl) throws CollectorServiceException {
|
||||
response = getInputSource(requestUrl);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getResponse() {
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isStatusOk() {
|
||||
return (response != null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean responseTypeContains(String string) {
|
||||
String responseType = getResponseType();
|
||||
if (responseType != null)
|
||||
return responseType.contains(string);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
package eu.dnetlib.data.collector.plugins.httpfilename;
|
||||
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
|
||||
/**
|
||||
* Created by miriam on 07/05/2018.
|
||||
*/
|
||||
/**
 * Minimal contract of the HTTP connector used by the httpfilename collector plugin.
 *
 * Created by miriam on 07/05/2018.
 */
public interface ConnectorInterface {

	/** Performs a GET on the given url, storing the response for later retrieval. */
	void get(final String requestUrl) throws CollectorServiceException;

	/** @return the body of the last response (may be null). */
	String getResponse();

	/** @return true when a response body is available. */
	boolean isStatusOk();

	/** @return true when the content type of the last response contains the given string. */
	boolean responseTypeContains(String string);

}
|
|
@ -0,0 +1,190 @@
|
|||
package eu.dnetlib.data.collector.plugins.httpfilename;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.json.JSONObject;
|
||||
import org.json.XML;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
/**
|
||||
* Created by miriam on 04/05/2018.
|
||||
*/
|
||||
public class HTTPWithFileNameCollectorIterable implements Iterable<String> {
|
||||
|
||||
private static final Log log = LogFactory.getLog(HTTPWithFileNameCollectorIterable.class);
|
||||
|
||||
private static final String JUNK = "<resource><url>%s</url><DOI>JUNK</DOI></resource>";
|
||||
public static final String APP_JSON = "application/json";
|
||||
public static final String APP_XML = "application/xml";
|
||||
public static final String TEXT_HTML = "text/html";
|
||||
private final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
|
||||
|
||||
|
||||
|
||||
|
||||
private String filterParam;
|
||||
|
||||
int total = 0;
|
||||
int filtered = 0;
|
||||
|
||||
public HTTPWithFileNameCollectorIterable(String startUrl, String filter){
|
||||
|
||||
this.filterParam = filter;
|
||||
Thread ft = new Thread(new FillMetaQueue(startUrl) );
|
||||
ft.start();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
return new HttpWithFileNameCollectorIterator(queue);
|
||||
}
|
||||
|
||||
private class FillMetaQueue implements Runnable {
|
||||
final Connector c = new Connector();
|
||||
|
||||
private final List<String> metas = Collections.synchronizedList(new ArrayList<String>());
|
||||
private final List<String> urls = Collections.synchronizedList(new ArrayList<>());
|
||||
|
||||
public FillMetaQueue(String startUrl){
|
||||
if(!startUrl.isEmpty()){
|
||||
urls.add(startUrl);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void fillQueue() {
|
||||
String url;
|
||||
|
||||
while((metas.size()>0 || urls.size() > 0 )) {
|
||||
log.debug("metas.size() = " + metas.size() + " urls.size() = " + urls.size() + " queue.size() = " +queue.size());
|
||||
if (metas.size() > 0) {
|
||||
url = metas.remove(0);
|
||||
try {
|
||||
c.get(url);
|
||||
} catch (CollectorServiceException e) {
|
||||
log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
|
||||
}
|
||||
if(c.isStatusOk()){
|
||||
try {
|
||||
String ret = c.getResponse();
|
||||
if (ret != null && ret.length()>0) {
|
||||
if (!containsFilter(ret))
|
||||
queue.put(addFilePath(ret, url, url.endsWith(".json")));
|
||||
//queue.offer(addFilePath(ret, url, url.endsWith(".json")), HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS);
|
||||
else
|
||||
filtered++;
|
||||
total++;
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
|
||||
|
||||
}
|
||||
}
|
||||
} else {
|
||||
url = urls.remove(0);
|
||||
try {
|
||||
c.get(url);
|
||||
} catch (CollectorServiceException e) {
|
||||
log.info("Impossible to collect url: " + url + " error: " + e.getMessage());
|
||||
}
|
||||
if(c.isStatusOk()) {
|
||||
if (c.responseTypeContains(TEXT_HTML)){
|
||||
recurFolder(c.getResponse(), url);
|
||||
} else if(c.responseTypeContains(APP_JSON) || c.responseTypeContains(APP_XML)){
|
||||
try {
|
||||
final String element = addFilePath(c.getResponse(), url, c.responseTypeContains(APP_JSON));
|
||||
//queue.offer(element, HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS);
|
||||
queue.put(element);
|
||||
} catch (InterruptedException e) {
|
||||
log.info("not inserted in queue element associate to url " + url + " error: " + e.getMessage() );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
try {
|
||||
//queue.offer(HttpWithFileNameCollectorIterator.TERMINATOR, HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS);
|
||||
queue.put(HttpWithFileNameCollectorIterator.TERMINATOR);
|
||||
} catch (InterruptedException e) {
|
||||
throw new IllegalStateException(String.format("could not add element to queue for more than %s%s", HttpWithFileNameCollectorIterator.waitTime, TimeUnit.SECONDS), e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private boolean containsFilter(String meta){
|
||||
if (filterParam == null || filterParam.isEmpty())
|
||||
return false;
|
||||
String[] filter = filterParam.split(";");
|
||||
for(String item:filter){
|
||||
if (meta.contains(item))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private String addFilePath(String meta, String url, boolean isJson){
|
||||
String path = url.replace("metadata", "pdf");
|
||||
|
||||
try {
|
||||
if(isJson)
|
||||
meta = meta.substring(0, meta.length() - 1) + ",'downloadFileUrl':'" + path.substring(0, path.indexOf(".json")) + ".pdf'}";
|
||||
else {
|
||||
|
||||
if (meta.contains("<!DOCTYPE")) {
|
||||
meta = meta.substring(meta.indexOf("<!DOCTYPE"));
|
||||
meta = meta.substring(meta.indexOf(">") + 1);
|
||||
}
|
||||
int index = meta.lastIndexOf("</");
|
||||
meta = meta.substring(0, index) + "<downloadFileUrl>" + path.substring(0, path.indexOf(".xml")) + ".pdf</downloadFileUrl>" + meta.substring(index);
|
||||
}
|
||||
} catch(Exception ex) {
|
||||
log.info("not file with extension .json or .xml");
|
||||
}
|
||||
|
||||
|
||||
if(isJson) {
|
||||
try {
|
||||
return XML.toString(new JSONObject("{'resource':" + meta + "}"));
|
||||
} catch(Exception e) {
|
||||
log.fatal("Impossible to transform json object to xml \n" + meta + "\n " + e.getMessage() + "\n" + url);
|
||||
// throw new RuntimeException();
|
||||
final String junk = String.format(JUNK, url);
|
||||
log.warn("returning " + junk);
|
||||
return junk;
|
||||
}
|
||||
}
|
||||
return meta;
|
||||
}
|
||||
|
||||
private void recurFolder(String text, String url){
|
||||
Document doc = Jsoup.parse(text);
|
||||
Elements links = doc.select("a");
|
||||
for(Element e:links){
|
||||
if (!e.text().equals("../")){
|
||||
String file = e.attr("href");
|
||||
if(file.endsWith(".json") || file.endsWith(".xml"))
|
||||
metas.add(url+file);
|
||||
else
|
||||
urls.add(url+file);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
fillQueue();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
package eu.dnetlib.data.collector.plugins.httpfilename;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
/**
|
||||
* Created by miriam on 04/05/2018.
|
||||
*/
|
||||
public class HTTPWithFileNameCollectorPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(InterfaceDescriptor interfaceDescriptor, String s, String s1) throws CollectorServiceException {
|
||||
return new HTTPWithFileNameCollectorIterable(interfaceDescriptor.getBaseUrl(), interfaceDescriptor.getParams().get("filter"));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
package eu.dnetlib.data.collector.plugins.httpfilename;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
|
||||
* Created by miriam on 25/06/2018.
|
||||
*/
|
||||
public class HttpWithFileNameCollectorIterator implements Iterator<String> {
|
||||
public static final String TERMINATOR = "FINITO";
|
||||
private static final Log log = LogFactory.getLog(HttpWithFileNameCollectorIterator.class);
|
||||
|
||||
private final ArrayBlockingQueue<String> queue;
|
||||
|
||||
public static final long waitTime = 60L;
|
||||
|
||||
private String last = "<resource><DOI>JUNK</DOI></resource>";
|
||||
|
||||
public HttpWithFileNameCollectorIterator(ArrayBlockingQueue<String> queue) {
|
||||
this.queue = queue;
|
||||
extractFromQueue();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
|
||||
|
||||
//return !(Objects.equals(last, TERMINATOR) || Objects.equals(last,null));
|
||||
return !(Objects.equals(last, TERMINATOR));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
try{
|
||||
|
||||
return last;
|
||||
|
||||
}finally{
|
||||
extractFromQueue();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void extractFromQueue() {
|
||||
|
||||
|
||||
try {
|
||||
last = queue.take();
|
||||
//last = queue.poll(waitTime, TimeUnit.SECONDS);
|
||||
}catch(InterruptedException e){
|
||||
log.warn("Interrupted while waiting for element to consume");
|
||||
throw new NoSuchElementException(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
package eu.dnetlib.data.collector.plugins.httplist;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.plugins.HttpConnector;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
|
||||
public class HttpListCollectorPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
@Autowired
|
||||
private HttpConnector httpConnector;
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
final String baseUrl = interfaceDescriptor.getBaseUrl();
|
||||
final String listAddress = interfaceDescriptor.getParams().get("listUrl");
|
||||
|
||||
return () -> new HttpListIterator(baseUrl, listAddress, httpConnector);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,64 @@
|
|||
package eu.dnetlib.data.collector.plugins.httplist;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.HttpConnector;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Iterator;
|
||||
|
||||
public class HttpListIterator implements Iterator<String> {
|
||||
|
||||
private HttpConnector httpConnector;
|
||||
|
||||
private String baseUrl;
|
||||
private String currentLine;
|
||||
private BufferedReader reader;
|
||||
|
||||
public HttpListIterator(final String baseUrl, final String listAddress, final HttpConnector httpConnector) {
|
||||
try {
|
||||
this.baseUrl = baseUrl;
|
||||
this.reader = new BufferedReader(new StringReader(download(listAddress)));
|
||||
this.httpConnector = httpConnector;
|
||||
this.currentLine = reader.readLine();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Error creating iterator", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized boolean hasNext() {
|
||||
return StringUtils.isNotBlank(currentLine);
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized String next() {
|
||||
try {
|
||||
if (StringUtils.isNotBlank(currentLine)) {
|
||||
return download(baseUrl + currentLine);
|
||||
} else {
|
||||
throw new RuntimeException("Iterator has reached the end");
|
||||
}
|
||||
} finally {
|
||||
try {
|
||||
this.currentLine = reader.readLine();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Error obtaining next element " + currentLine, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private String download(final String url) {
|
||||
try {
|
||||
return httpConnector.getInputSource(url);
|
||||
} catch (CollectorServiceException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {}
|
||||
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
package eu.dnetlib.data.collector.plugins.mongo;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileReader;
|
||||
import java.util.Iterator;
|
||||
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.information.collectionservice.rmi.CollectionServiceException;
|
||||
|
||||
/**
|
||||
* The Class MongoDumpIterable.
|
||||
*/
|
||||
public class MongoDumpIterable implements Iterable<String> {
|
||||
|
||||
/** The input stream. */
|
||||
private final FileReader inputStream;
|
||||
|
||||
/**
|
||||
* Instantiates a new mongo dump iterable.
|
||||
*
|
||||
* @param inputFile the input file
|
||||
* @throws CollectionServiceException the collection service exception
|
||||
*/
|
||||
public MongoDumpIterable(final File inputFile) throws CollectorServiceException {
|
||||
try {
|
||||
this.inputStream = new FileReader(inputFile);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new CollectorServiceException("Error unable to open inputStream", e);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.lang.Iterable#iterator()
|
||||
*/
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
return new MongoDumpIterator(inputStream);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
package eu.dnetlib.data.collector.plugins.mongo;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonObject;
|
||||
import com.google.gson.JsonParser;
|
||||
|
||||
public class MongoDumpIterator implements Iterator<String> {
|
||||
|
||||
private final BufferedReader inputStream;
|
||||
private String currentLine = null;
|
||||
|
||||
public MongoDumpIterator(final FileReader inputStream) {
|
||||
this.inputStream = new BufferedReader(inputStream);
|
||||
this.currentLine = getNextLine();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return currentLine != null;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
final String returnedString = this.currentLine;
|
||||
this.currentLine = getNextLine();
|
||||
return returnedString;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
private String getNextLine() {
|
||||
try {
|
||||
String input = inputStream.readLine();
|
||||
while (input != null) {
|
||||
JsonElement jElement = new JsonParser().parse(input);
|
||||
JsonObject jobject = jElement.getAsJsonObject();
|
||||
if (jobject.has("body")) { return jobject.get("body").getAsString(); }
|
||||
input = inputStream.readLine();
|
||||
}
|
||||
return null;
|
||||
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
package eu.dnetlib.data.collector.plugins.mongo;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
public class MongoDumpPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
final String baseUrl = interfaceDescriptor.getBaseUrl();
|
||||
if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
|
||||
final File f = new File(baseUrl);
|
||||
if (f.exists() == false) { throw new CollectorServiceException("the file at url " + baseUrl + " does not exists"); }
|
||||
|
||||
return new MongoDumpIterable(f);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
package eu.dnetlib.data.collector.plugins.oai;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterators;
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
import org.springframework.beans.factory.annotation.Required;
|
||||
|
||||
public class OaiCollectorPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
private static final String FORMAT_PARAM = "format";
|
||||
private static final String OAI_SET_PARAM = "set";
|
||||
|
||||
private OaiIteratorFactory oaiIteratorFactory;
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
final String baseUrl = interfaceDescriptor.getBaseUrl();
|
||||
final String mdFormat = interfaceDescriptor.getParams().get(FORMAT_PARAM);
|
||||
final String setParam = interfaceDescriptor.getParams().get(OAI_SET_PARAM);
|
||||
final List<String> sets = Lists.newArrayList();
|
||||
if (setParam != null) {
|
||||
sets.addAll(Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam)));
|
||||
}
|
||||
if (sets.isEmpty()) {
|
||||
// If no set is defined, ALL the sets must be harvested
|
||||
sets.add("");
|
||||
}
|
||||
|
||||
if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
|
||||
|
||||
if (mdFormat == null || mdFormat.isEmpty()) { throw new CollectorServiceException("Param 'mdFormat' is null or empty"); }
|
||||
|
||||
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + fromDate); }
|
||||
|
||||
if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + untilDate); }
|
||||
|
||||
return () -> Iterators.concat(
|
||||
sets.stream()
|
||||
.map(set -> oaiIteratorFactory.newIterator(baseUrl, mdFormat, set, fromDate, untilDate))
|
||||
.iterator());
|
||||
}
|
||||
|
||||
public OaiIteratorFactory getOaiIteratorFactory() {
|
||||
return oaiIteratorFactory;
|
||||
}
|
||||
|
||||
@Required
|
||||
public void setOaiIteratorFactory(final OaiIteratorFactory oaiIteratorFactory) {
|
||||
this.oaiIteratorFactory = oaiIteratorFactory;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,168 @@
|
|||
package eu.dnetlib.data.collector.plugins.oai;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.URLEncoder;
|
||||
import java.util.Iterator;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.HttpConnector;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
|
||||
|
||||
/**
 * Iterator over the records of an OAI-PMH endpoint. Pages are fetched lazily:
 * the first page on the first hasNext()/next() call, the following pages (via
 * the resumption token) whenever the local queue runs empty.
 */
public class OaiIterator implements Iterator<String> {

	private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM

	// records of the current page, consumed by next()
	private Queue<String> queue = new PriorityBlockingQueue<String>();
	private SAXReader reader = new SAXReader();

	private String baseUrl;
	private String set;
	private String mdFormat;
	private String fromDate;
	private String untilDate;
	// resumption token of the next page; null or empty when there are no more pages
	private String token;
	// true once the first page has been requested
	private boolean started;
	private HttpConnector httpConnector;

	public OaiIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate, final HttpConnector httpConnector) {
		this.baseUrl = baseUrl;
		this.mdFormat = mdFormat;
		this.set = set;
		this.fromDate = fromDate;
		this.untilDate = untilDate;
		this.started = false;
		this.httpConnector = httpConnector;
	}

	// lazily downloads the first page, keeping its resumption token
	private void verifyStarted() {
		if (!this.started) {
			this.started = true;
			try {
				this.token = firstPage();
			} catch (CollectorServiceException e) {
				throw new RuntimeException(e);
			}
		}
	}

	@Override
	public boolean hasNext() {
		synchronized (queue) {
			verifyStarted();
			return !queue.isEmpty();
		}
	}

	@Override
	public String next() {
		synchronized (queue) {
			verifyStarted();
			final String res = queue.poll();
			// pre-fetch following pages while the queue is empty and a resumption
			// token is available, so the next hasNext() sees a non-empty queue
			while (queue.isEmpty() && (token != null) && !token.isEmpty()) {
				try {
					token = otherPages(token);
				} catch (CollectorServiceException e) {
					throw new RuntimeException(e);
				}
			}
			return res;
		}
	}

	@Override
	public void remove() {}

	// builds and downloads the initial ListRecords request (format, set, date range)
	private String firstPage() throws CollectorServiceException {
		try {
			String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat,"UTF-8");
			if ((set != null) && !set.isEmpty()) {
				url += "&set=" + URLEncoder.encode(set,"UTF-8");
			}
			if ((fromDate != null) && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
				url += "&from=" + URLEncoder.encode(fromDate,"UTF-8");
			}
			if ((untilDate != null) && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
				url += "&until=" + URLEncoder.encode(untilDate,"UTF-8");
			}
			log.info("Start harvesting using url: " + url);

			return downloadPage(url);
		} catch(UnsupportedEncodingException e) {
			throw new CollectorServiceException(e);
		}
	}

	// best-effort extraction of the resumption token from a raw, unparseable page
	private String extractResumptionToken(final String xml) {

		final String s = StringUtils.substringAfter(xml, "<resumptionToken");
		if (s == null){
			return null;
		}

		final String result = StringUtils.substringBetween(s, ">", "</");
		if (result == null)
			return null;
		return result.trim();

	}

	// downloads the page addressed by a resumption token
	private String otherPages(final String resumptionToken) throws CollectorServiceException {
		try {
			return downloadPage(baseUrl + "?verb=ListRecords&resumptionToken=" + URLEncoder.encode(resumptionToken,"UTF-8"));
		} catch (UnsupportedEncodingException e) {
			throw new CollectorServiceException(e);
		}
	}

	// downloads one OAI page, enqueues its records and returns the next resumption token
	private String downloadPage(final String url) throws CollectorServiceException {

		final String xml = httpConnector.getInputSource(url);
		Document doc;
		try {
			doc = reader.read(new StringReader(xml));
		} catch (DocumentException e) {
			log.warn("Error parsing xml, I try to clean it: " + xml, e);
			final String cleaned = XmlCleaner.cleanAllEntities(xml);
			try {
				doc = reader.read(new StringReader(cleaned));
			} catch (DocumentException e1) {
				// even the cleaned page does not parse: salvage the resumption token, if any
				final String resumptionToken = extractResumptionToken(xml);
				if (resumptionToken == null)
					throw new CollectorServiceException("Error parsing cleaned document:" + cleaned, e1);
				return resumptionToken;
			}
		}

		final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
		if (errorNode != null) {
			final String code = errorNode.valueOf("@code");
			if ("noRecordsMatch".equalsIgnoreCase(code.trim())) {
				// empty result set: not a failure, just stop paging
				log.warn("noRecordsMatch for oai call: " + url);
				return null;
			} else {
				throw new CollectorServiceException(code + " - " + errorNode.getText());
			}
		}

		for (Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
			queue.add(((Node) o).asXML());
		}

		return doc.valueOf("//*[local-name()='resumptionToken']");

	}

}
|
|
@ -0,0 +1,25 @@
|
|||
package eu.dnetlib.data.collector.plugins.oai;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Required;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.HttpConnector;
|
||||
|
||||
public class OaiIteratorFactory {
|
||||
|
||||
private HttpConnector httpConnector;
|
||||
|
||||
public Iterator<String> newIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate) {
|
||||
return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, httpConnector);
|
||||
}
|
||||
|
||||
public HttpConnector getHttpConnector() {
|
||||
return httpConnector;
|
||||
}
|
||||
|
||||
@Required
|
||||
public void setHttpConnector(HttpConnector httpConnector) {
|
||||
this.httpConnector = httpConnector;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,268 @@
|
|||
package eu.dnetlib.data.collector.plugins.oai.engine;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
 * Repairs XML fragments containing HTML entities or characters that are illegal in XML,
 * so that they can be parsed by a standard XML parser.
 *
 * NOTE: the entity string literals below were previously corrupted by HTML-entity decoding
 * (e.g. {@code "&quot;"} had collapsed to a bare quote); they are restored here from the
 * per-entry comments and the cp1252 code points they documented.
 *
 * @author jochen, Andreas Czerniak
 */
public class XmlCleaner {

	/**
	 * Pattern for numeric entities.
	 */
	private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); //$NON-NLS-1$

	// see https://www.w3.org/TR/REC-xml/#charsets : references to control characters are never legal XML
	private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];");

	/**
	 * Pattern that negates the allowable XML 4 byte unicode characters. Valid
	 * are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
	 * [#x10000-#x10FFFF]
	 */
	private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); //$NON-NLS-1$

	// The only 4 entity names XML supports natively (kept as-is)
	private static Set<String> goodEntities = new HashSet<String>();
	// Map entities to their unicode equivalent (or a space when there is none)
	private static Map<String, String> badEntities = new HashMap<String, String>();

	static {
		// pre-defined XML entities
		goodEntities.add("&quot;"); //$NON-NLS-1$ // quotation mark
		goodEntities.add("&amp;"); //$NON-NLS-1$ // ampersand
		goodEntities.add("&lt;"); //$NON-NLS-1$ // less-than sign
		goodEntities.add("&gt;"); //$NON-NLS-1$ // greater-than sign
		// control entities: &#127; ... &#159; are illegal HTML characters, replaced by a space
		badEntities.put("&#127;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#128;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#129;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#130;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#131;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#132;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#133;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#134;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#135;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#136;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#137;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#138;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#139;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#140;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#141;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#142;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#143;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#144;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#145;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#146;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#147;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#148;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#149;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#150;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#151;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#152;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#153;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#154;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#155;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#156;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#157;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#158;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		badEntities.put("&#159;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
		// misc entities
		badEntities.put("&euro;", "\u20AC"); //$NON-NLS-1$ //$NON-NLS-2$ // euro
		badEntities.put("&lsquo;", "\u2018"); //$NON-NLS-1$ //$NON-NLS-2$ // left single quotation mark
		badEntities.put("&rsquo;", "\u2019"); //$NON-NLS-1$ //$NON-NLS-2$ // right single quotation mark
		// Latin 1 entities
		badEntities.put("&nbsp;", "\u00A0"); //$NON-NLS-1$ //$NON-NLS-2$ // no-break space
		badEntities.put("&iexcl;", "\u00A1"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation mark
		badEntities.put("&cent;", "\u00A2"); //$NON-NLS-1$ //$NON-NLS-2$ // cent sign
		badEntities.put("&pound;", "\u00A3"); //$NON-NLS-1$ //$NON-NLS-2$ // pound sign
		badEntities.put("&curren;", "\u00A4"); //$NON-NLS-1$ //$NON-NLS-2$ // currency sign
		badEntities.put("&yen;", "\u00A5"); //$NON-NLS-1$ //$NON-NLS-2$ // yen sign
		badEntities.put("&brvbar;", "\u00A6"); //$NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar
		badEntities.put("&sect;", "\u00A7"); //$NON-NLS-1$ //$NON-NLS-2$ // section sign
		badEntities.put("&uml;", "\u00A8"); //$NON-NLS-1$ //$NON-NLS-2$ // diaeresis
		badEntities.put("&copy;", "\u00A9"); //$NON-NLS-1$ //$NON-NLS-2$ // copyright sign
		badEntities.put("&ordf;", "\u00AA"); //$NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal indicator
		badEntities.put("&laquo;", "\u00AB"); //$NON-NLS-1$ //$NON-NLS-2$ // left-pointing double angle quotation mark
		badEntities.put("&not;", "\u00AC"); //$NON-NLS-1$ //$NON-NLS-2$ // not sign
		badEntities.put("&shy;", "\u00AD"); //$NON-NLS-1$ //$NON-NLS-2$ // soft hyphen
		badEntities.put("&reg;", "\u00AE"); //$NON-NLS-1$ //$NON-NLS-2$ // registered sign
		badEntities.put("&macr;", "\u00AF"); //$NON-NLS-1$ //$NON-NLS-2$ // macron
		badEntities.put("&deg;", "\u00B0"); //$NON-NLS-1$ //$NON-NLS-2$ // degree sign
		badEntities.put("&plusmn;", "\u00B1"); //$NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign
		badEntities.put("&sup2;", "\u00B2"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript two
		badEntities.put("&sup3;", "\u00B3"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript three
		badEntities.put("&acute;", "\u00B4"); //$NON-NLS-1$ //$NON-NLS-2$ // acute accent
		badEntities.put("&micro;", "\u00B5"); //$NON-NLS-1$ //$NON-NLS-2$ // micro sign
		badEntities.put("&para;", "\u00B6"); //$NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign
		badEntities.put("&middot;", "\u00B7"); //$NON-NLS-1$ //$NON-NLS-2$ // middle dot
		badEntities.put("&cedil;", "\u00B8"); //$NON-NLS-1$ //$NON-NLS-2$ // cedilla
		badEntities.put("&sup1;", "\u00B9"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript one
		badEntities.put("&ordm;", "\u00BA"); //$NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal indicator
		badEntities.put("&raquo;", "\u00BB"); //$NON-NLS-1$ //$NON-NLS-2$ // right-pointing double angle quotation mark
		badEntities.put("&frac14;", "\u00BC"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one quarter
		badEntities.put("&frac12;", "\u00BD"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one half
		badEntities.put("&frac34;", "\u00BE"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three quarters
		badEntities.put("&iquest;", "\u00BF"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted question mark
		badEntities.put("&Agrave;", "\u00C0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with grave
		badEntities.put("&Aacute;", "\u00C1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with acute
		badEntities.put("&Acirc;", "\u00C2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with circumflex
		badEntities.put("&Atilde;", "\u00C3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with tilde
		badEntities.put("&Auml;", "\u00C4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with diaeresis
		badEntities.put("&Aring;", "\u00C5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with ring above
		badEntities.put("&AElig;", "\u00C6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter AE
		badEntities.put("&Ccedil;", "\u00C7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter C with cedilla
		badEntities.put("&Egrave;", "\u00C8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with grave
		badEntities.put("&Eacute;", "\u00C9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with acute
		badEntities.put("&Ecirc;", "\u00CA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with circumflex
		badEntities.put("&Euml;", "\u00CB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with diaeresis
		badEntities.put("&Igrave;", "\u00CC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with grave
		badEntities.put("&Iacute;", "\u00CD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with acute
		badEntities.put("&Icirc;", "\u00CE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with circumflex
		badEntities.put("&Iuml;", "\u00CF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with diaeresis
		badEntities.put("&ETH;", "\u00D0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH
		badEntities.put("&Ntilde;", "\u00D1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter N with tilde
		badEntities.put("&Ograve;", "\u00D2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with grave
		badEntities.put("&Oacute;", "\u00D3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with acute
		badEntities.put("&Ocirc;", "\u00D4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with circumflex
		badEntities.put("&Otilde;", "\u00D5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with tilde
		badEntities.put("&Ouml;", "\u00D6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with diaeresis
		badEntities.put("&times;", "\u00D7"); //$NON-NLS-1$ //$NON-NLS-2$ // multiplication sign
		badEntities.put("&Oslash;", "\u00D8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with stroke
		badEntities.put("&Ugrave;", "\u00D9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with grave
		badEntities.put("&Uacute;", "\u00DA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with acute
		badEntities.put("&Ucirc;", "\u00DB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with circumflex
		badEntities.put("&Uuml;", "\u00DC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with diaeresis
		badEntities.put("&Yacute;", "\u00DD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter Y with acute
		badEntities.put("&THORN;", "\u00DE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter THORN
		badEntities.put("&szlig;", "\u00DF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter sharp s
		badEntities.put("&agrave;", "\u00E0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with grave
		badEntities.put("&aacute;", "\u00E1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with acute
		badEntities.put("&acirc;", "\u00E2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with circumflex
		badEntities.put("&atilde;", "\u00E3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with tilde
		badEntities.put("&auml;", "\u00E4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with diaeresis
		badEntities.put("&aring;", "\u00E5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with ring above
		badEntities.put("&aelig;", "\u00E6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae
		badEntities.put("&ccedil;", "\u00E7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter c with cedilla
		badEntities.put("&egrave;", "\u00E8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with grave
		badEntities.put("&eacute;", "\u00E9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with acute
		badEntities.put("&ecirc;", "\u00EA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with circumflex
		badEntities.put("&euml;", "\u00EB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with diaeresis
		badEntities.put("&igrave;", "\u00EC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with grave
		badEntities.put("&iacute;", "\u00ED"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with acute
		badEntities.put("&icirc;", "\u00EE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with circumflex
		badEntities.put("&iuml;", "\u00EF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with diaeresis
		badEntities.put("&eth;", "\u00F0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth
		badEntities.put("&ntilde;", "\u00F1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter n with tilde
		badEntities.put("&ograve;", "\u00F2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with grave
		badEntities.put("&oacute;", "\u00F3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with acute
		badEntities.put("&ocirc;", "\u00F4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with circumflex
		badEntities.put("&otilde;", "\u00F5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with tilde
		badEntities.put("&ouml;", "\u00F6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with diaeresis
		badEntities.put("&divide;", "\u00F7"); //$NON-NLS-1$ //$NON-NLS-2$ // division sign
		badEntities.put("&oslash;", "\u00F8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with stroke
		badEntities.put("&ugrave;", "\u00F9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with grave
		badEntities.put("&uacute;", "\u00FA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with acute
		badEntities.put("&ucirc;", "\u00FB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with circumflex
		badEntities.put("&uuml;", "\u00FC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with diaeresis
		badEntities.put("&yacute;", "\u00FD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with acute
		badEntities.put("&thorn;", "\u00FE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter thorn
		badEntities.put("&yuml;", "\u00FF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with diaeresis
	}

	/**
	 * For each entity in the input that is not allowed in XML, replace the
	 * entity with its unicode equivalent or remove it. For each instance of a
	 * bare {@literal &}, replace it with {@literal &amp;}<br/>
	 * XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal &lt;} and {@literal &gt;}.
	 *
	 * @param broken
	 *            the string to handle entities
	 * @return the string with entities appropriately fixed up, or null when the input is null
	 */
	public static String cleanAllEntities(final String broken) {
		if (broken == null) {
			return null;
		}

		// first remove character references to control characters, then any character
		// outside the legal XML ranges
		String working = invalidControlCharPattern.matcher(broken).replaceAll("");
		working = invalidCharacterPattern.matcher(working).replaceAll("");

		int cleanfrom = 0;

		while (true) {
			int amp = working.indexOf('&', cleanfrom);
			// If there are no more amps then we are done
			if (amp == -1) {
				break;
			}
			// Skip references of the kind &#ddd;
			if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
				cleanfrom = working.indexOf(';', amp) + 1;
				continue;
			}
			int i = amp + 1;
			while (true) {
				// if we are at the end of the string then just escape the '&';
				if (i >= working.length()) {
					return working.substring(0, amp) + "&amp;" + working.substring(amp + 1); //$NON-NLS-1$
				}
				// if we have come to a ; then we have an entity
				// If it is something that xml can't handle then replace it.
				char c = working.charAt(i);
				if (c == ';') {
					final String entity = working.substring(amp, i + 1);
					final String replace = handleEntity(entity);
					working = working.substring(0, amp) + replace + working.substring(i + 1);
					break;
				}
				// Did we end an entity without finding a closing ;
				// Then treat it as an '&' that needs to be replaced with &amp;
				if (!Character.isLetterOrDigit(c)) {
					working = working.substring(0, amp) + "&amp;" + working.substring(amp + 1); //$NON-NLS-1$
					amp = i + 4; // account for the 4 extra characters
					break;
				}
				i++;
			}
			cleanfrom = amp + 1;
		}

		// escape double angle brackets, which some producers emit and parsers reject
		if (Pattern.compile("<<").matcher(working).find()) {
			working = working.replaceAll("<<", "&lt;&lt;");
		}

		if (Pattern.compile(">>").matcher(working).find()) {
			working = working.replaceAll(">>", "&gt;&gt;");
		}

		return working;
	}

	/**
	 * Replace entity with its unicode equivalent, if it is not a valid XML
	 * entity. Otherwise strip it out. XML only allows 4 entities: &amp;amp;,
	 * &amp;quot;, &amp;lt; and &amp;gt;.
	 *
	 * @param entity
	 *            the entity to be replaced
	 * @return the substitution for the entity, either itself, the unicode
	 *         equivalent or an empty string.
	 */
	private static String handleEntity(final String entity) {
		if (goodEntities.contains(entity)) {
			return entity;
		}

		// badEntities is typed, no cast needed; unknown entities are dropped
		final String replace = badEntities.get(entity);
		return replace != null ? replace : "";
	}
}
|
|
@ -0,0 +1,40 @@
|
|||
package eu.dnetlib.data.collector.plugins.oaisets;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Required;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
public class OaiSetsCollectorPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
private OaiSetsIteratorFactory oaiSetsIteratorFactory;
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
final String baseUrl = interfaceDescriptor.getBaseUrl();
|
||||
|
||||
if (baseUrl == null || baseUrl.isEmpty()) { throw new CollectorServiceException("Param 'baseurl' is null or empty"); }
|
||||
|
||||
return new Iterable<String>() {
|
||||
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
return oaiSetsIteratorFactory.newIterator(baseUrl);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public OaiSetsIteratorFactory getOaiSetsIteratorFactory() {
|
||||
return oaiSetsIteratorFactory;
|
||||
}
|
||||
|
||||
@Required
|
||||
public void setOaiSetsIteratorFactory(final OaiSetsIteratorFactory oaiSetsIteratorFactory) {
|
||||
this.oaiSetsIteratorFactory = oaiSetsIteratorFactory;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,133 @@
|
|||
package eu.dnetlib.data.collector.plugins.oaisets;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.Iterator;
|
||||
import java.util.Queue;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.HttpConnector;
|
||||
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
|
||||
public class OaiSetsIterator implements Iterator<String> {
|
||||
|
||||
private static final Log log = LogFactory.getLog(OaiSetsIterator.class);
|
||||
|
||||
private Queue<String> queue = new PriorityBlockingQueue<String>();
|
||||
private SAXReader reader = new SAXReader();
|
||||
|
||||
private String baseUrl;
|
||||
|
||||
private String token;
|
||||
private boolean started;
|
||||
private HttpConnector httpConnector;
|
||||
|
||||
private Set<String> setsAlreadySeen = Sets.newHashSet();
|
||||
|
||||
public OaiSetsIterator(final String baseUrl, final HttpConnector httpConnector) {
|
||||
this.baseUrl = baseUrl;
|
||||
this.started = false;
|
||||
this.httpConnector = httpConnector;
|
||||
}
|
||||
|
||||
private void verifyStarted() {
|
||||
if (!this.started) {
|
||||
this.started = true;
|
||||
try {
|
||||
this.token = firstPage();
|
||||
} catch (CollectorServiceException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
synchronized (queue) {
|
||||
verifyStarted();
|
||||
return !queue.isEmpty();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
synchronized (queue) {
|
||||
verifyStarted();
|
||||
final String res = queue.poll();
|
||||
while (queue.isEmpty() && (token != null) && !token.isEmpty()) {
|
||||
try {
|
||||
token = otherPages(token);
|
||||
} catch (CollectorServiceException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {}
|
||||
|
||||
private String firstPage() throws CollectorServiceException {
|
||||
final String url = baseUrl + "?verb=ListSets";
|
||||
log.info("Start harvesting using url: " + url);
|
||||
return downloadPage(url);
|
||||
}
|
||||
|
||||
private String otherPages(final String resumptionToken) throws CollectorServiceException {
|
||||
return downloadPage(baseUrl + "?verb=ListSets&resumptionToken=" + resumptionToken);
|
||||
}
|
||||
|
||||
private String downloadPage(final String url) throws CollectorServiceException {
|
||||
|
||||
final String xml = httpConnector.getInputSource(url);
|
||||
|
||||
Document doc;
|
||||
try {
|
||||
doc = reader.read(new StringReader(xml));
|
||||
} catch (DocumentException e) {
|
||||
log.warn("Error parsing xml, I try to clean it: " + xml, e);
|
||||
final String cleaned = XmlCleaner.cleanAllEntities(xml);
|
||||
try {
|
||||
doc = reader.read(new StringReader(cleaned));
|
||||
} catch (DocumentException e1) {
|
||||
throw new CollectorServiceException("Error parsing cleaned document:" + cleaned, e1);
|
||||
}
|
||||
}
|
||||
|
||||
final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
|
||||
if (errorNode != null) {
|
||||
final String code = errorNode.valueOf("@code");
|
||||
if ("noRecordsMatch".equalsIgnoreCase(code.trim())) {
|
||||
log.warn("noRecordsMatch for oai call: " + url);
|
||||
return null;
|
||||
} else throw new CollectorServiceException(code + " - " + errorNode.getText());
|
||||
}
|
||||
|
||||
boolean sawAllSets = true;
|
||||
for (Object o : doc.selectNodes("//*[local-name()='ListSets']/*[local-name()='set']")) {
|
||||
String set = ((Element) o).valueOf("./*[local-name()='setSpec']");
|
||||
if (!setsAlreadySeen.contains(set)) {
|
||||
sawAllSets = false;
|
||||
setsAlreadySeen.add(set);
|
||||
queue.add(((Node) o).asXML());
|
||||
}
|
||||
}
|
||||
if (sawAllSets) {
|
||||
log.warn("URL " + baseUrl + " keeps returning the same OAI sets. Please contact the repo admin.");
|
||||
System.out.println("URL " + baseUrl + " keeps returning the same OAI sets. Please contact the repo admin.");
|
||||
return null;
|
||||
} else return doc.valueOf("//*[local-name()='resumptionToken']");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
package eu.dnetlib.data.collector.plugins.oaisets;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Required;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.HttpConnector;
|
||||
|
||||
public class OaiSetsIteratorFactory {
|
||||
|
||||
private HttpConnector httpConnector;
|
||||
|
||||
public Iterator<String> newIterator(String baseUrl) {
|
||||
return new OaiSetsIterator(baseUrl, httpConnector);
|
||||
}
|
||||
|
||||
public HttpConnector getHttpConnector() {
|
||||
return httpConnector;
|
||||
}
|
||||
|
||||
@Required
|
||||
public void setHttpConnector(HttpConnector httpConnector) {
|
||||
this.httpConnector = httpConnector;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,117 @@
|
|||
package eu.dnetlib.data.collector.plugins.opentrial;
|
||||
|
||||
/**
|
||||
* Created by miriam on 07/03/2017.
|
||||
*/
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import java.net.*;
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
//import java.util.function.Consumer;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.json.*;
|
||||
|
||||
|
||||
|
||||
/**
 * Iterable over the trial records returned by the OpenTrials REST API.
 *
 * Builds a query url from the given base url (optionally restricted by a
 * registration_date range), reads the total hit count from the first page,
 * then pages through the results 100 at a time, converting each JSON item to
 * an XML string.
 */
public class OpenTrialIterator implements Iterable<String> {

	// fully assembled query url (base url + query string), page number appended per request
	private final String base_url;
	// total number of records reported by the API in 'total_count'
	private int total ;
	// buffer of records fetched but not yet returned; capacity 100 matches per_page=100.
	// NOTE(review): fillTrials() uses the blocking put(); if a page ever returned more
	// than 100 items the producer would block — TODO confirm the API page size.
	private ArrayBlockingQueue<String> trials = new ArrayBlockingQueue<String>(100);
	// number of records fetched so far, compared against 'total' in hasNext()
	private int current = 0;
	private static final Log log = LogFactory.getLog(OpenTrialIterator.class);

	/**
	 * Builds the query url and eagerly fetches the first page (setting 'total').
	 *
	 * @param base_url  base url of the OpenTrials API endpoint
	 * @param from_date lower registration_date bound; when null no date filter is applied
	 * @param to_date   upper registration_date bound; when null an open-ended range is used
	 * @throws CollectorServiceException when the first page cannot be fetched or parsed
	 */
	public OpenTrialIterator(String base_url, String from_date, String to_date)throws CollectorServiceException{
		try {
			String q = "per_page=100";
			if (!(from_date == null)) {
				if (!(to_date == null)) {
					// bounded range: q=registration_date:[from TO to] (url-encoded)
					q = "q=registration_date%3A%5B" + from_date + "%20TO%20" + to_date + "%5D&" + q;

				} else
					// open-ended range: q=registration_date:[from TO *]
					q = "q=registration_date%3A%5B" + from_date + "%20TO%20*%5D&" + q;
			}
			this.base_url = base_url+ q;
			log.info("url from which to collect " + this.base_url);
			prepare();
		}catch(Exception ex){
			throw new CollectorServiceException(ex);
		}
	}

	// Fetches page 1, records the total hit count and buffers its items.
	private void prepare()throws Exception {
		JSONObject json = new JSONObject(getPage(1));
		total = json.getInt("total_count");
		log.info("Total number of entries to collect: " + total);
		fillTrials(json);
	}


	@Override
	public Iterator<String> iterator() {
		return new Iterator<String>(){

			// next page to fetch; page 1 was already consumed by prepare()
			private int page_number = 2;


			// removal is not supported: intentionally a no-op
			@Override
			public void remove(){

			}

			@Override
			public String next() {
				try {
					// lazily fetch the next page when the buffer runs dry
					if (trials.isEmpty()) {
						JSONObject json = new JSONObject(getPage(page_number));
						fillTrials(json);
						page_number++;
					}
					return trials.poll();
				}catch(Exception ex){
					// Iterator.next() cannot declare checked exceptions
					throw new CollectorServiceRuntimeException(ex);
				}
			}

			@Override
			public boolean hasNext(){
				// NOTE(review): the message lacks the closing ')' — cosmetic, debug-level only
				log.debug("More entries to collect: (" + current + "<" + total + "=" + (current < total));
				return (current < total || !trials.isEmpty());
			}


		};

	}

	// Converts every JSON item of the given page to XML and buffers it, bumping 'current'.
	private void fillTrials(JSONObject json)throws CollectorServiceException{

		JSONArray entries = json.getJSONArray("items");
		for(Object entry: entries) {
			try {
				// put() blocks when the buffer is full (see NOTE on 'trials' above)
				trials.put(XML.toString(entry));
			}catch(Exception ex){
				throw new CollectorServiceException(ex);
			}
			current++;
		}

	}
	// Downloads one result page as a raw string, faking a browser User-Agent.
	private String getPage(int page_number)throws CollectorServiceException {

		try {
			URL url = new URL(base_url + "&page=" + page_number);
			URLConnection conn = url.openConnection();
			conn.setRequestProperty("User-Agent", "Mozilla/5.0");
			return (IOUtils.toString(conn.getInputStream()));
		}catch(Exception ex){
			throw new CollectorServiceException(ex);
		}

	}


}
|
|
@ -0,0 +1,27 @@
|
|||
package eu.dnetlib.data.collector.plugins.opentrial;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Created by miriam on 07/03/2017.
|
||||
*/
|
||||
public class OpenTrialPlugin extends AbstractCollectorPlugin{
|
||||
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
try {
|
||||
|
||||
OpenTrialIterator iterator = new OpenTrialIterator(interfaceDescriptor.getBaseUrl(),fromDate,untilDate);
|
||||
return iterator;
|
||||
} catch (Exception e) {
|
||||
throw new CollectorServiceException("OOOPS something bad happen on creating iterator ", e);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
package eu.dnetlib.data.collector.plugins.projects.grist;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
/**
|
||||
* Plugin to collect metadata record about projects and fundings via the europePMC GRIST API (e.g. WT projects).
|
||||
* <p>
|
||||
* Documentation on GRIST API: http://europepmc.org/GristAPI.
|
||||
* </p>
|
||||
* <p>
|
||||
* BaseURL: http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:"Wellcome Trust"&resultType=core
|
||||
* where resultType=core asks for the complete information (including abstracts).
|
||||
* The results returned by the API are XMLs.
|
||||
* </p>
|
||||
* <p>
|
||||
* Pagination: use parameter 'page'. When the response contains empty 'RecordList', it means we reached the end.
|
||||
* </p>
|
||||
*
|
||||
* @author alessia
|
||||
*/
|
||||
public class GristCollectorPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
//baseURL: http://www.ebi.ac.uk/europepmc/GristAPI/rest/get/query=ga:%22Wellcome%20Trust%22&resultType=core
|
||||
return new GristProjectsIterable(interfaceDescriptor.getBaseUrl());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,136 @@
|
|||
package eu.dnetlib.data.collector.plugins.projects.grist;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
|
||||
import eu.dnetlib.enabling.resultset.SizedIterable;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.io.SAXReader;
|
||||
|
||||
public class GristProjectsIterable implements SizedIterable<String> {
|
||||
|
||||
private static final Log log = LogFactory.getLog(GristProjectsIterable.class); // NOPMD by marko on 11/24/08 5:02 PM
|
||||
|
||||
private String queryURL;
|
||||
private int total;
|
||||
private SAXReader reader;
|
||||
|
||||
public GristProjectsIterable(String baseURL) throws CollectorServiceException {
|
||||
queryURL = baseURL;
|
||||
reader = new SAXReader();
|
||||
total = getTotalCount();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getNumberOfElements() {
|
||||
return total;
|
||||
}
|
||||
|
||||
private int getTotalCount() throws CollectorServiceException {
|
||||
try {
|
||||
URL pageUrl = new URL(queryURL);
|
||||
log.debug("Getting hit count from: " + pageUrl.toString());
|
||||
String resultPage = IOUtils.toString(pageUrl);
|
||||
Document doc = reader.read(IOUtils.toInputStream(resultPage));
|
||||
String hitCount = doc.selectSingleNode("/Response/HitCount").getText();
|
||||
return Integer.parseInt(hitCount);
|
||||
} catch (NumberFormatException e) {
|
||||
log.warn("Cannot set the total count from '/Response/HitCount'");
|
||||
} catch (DocumentException e) {
|
||||
throw new CollectorServiceException(e);
|
||||
} catch (MalformedURLException e) {
|
||||
throw new CollectorServiceException(e);
|
||||
} catch (IOException e) {
|
||||
throw new CollectorServiceException(e);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
return new Iterator<String>() {
|
||||
|
||||
private Queue<String> projects = new PriorityBlockingQueue<String>();
|
||||
private boolean morePages = true;
|
||||
private int pageNumber = 0;
|
||||
private SAXReader reader = new SAXReader();
|
||||
//The following is for debug only
|
||||
private int nextCounter = 0;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
try {
|
||||
fillProjectListIfNeeded();
|
||||
} catch (CollectorServiceException e) {
|
||||
throw new CollectorServiceRuntimeException(e);
|
||||
}
|
||||
return !projects.isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
nextCounter++;
|
||||
log.debug(String.format("Calling next %s times. projects queue has %s elements", nextCounter, projects.size()));
|
||||
try {
|
||||
fillProjectListIfNeeded();
|
||||
return projects.poll();
|
||||
} catch (CollectorServiceException e) {
|
||||
throw new CollectorServiceRuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
private boolean fillProjectListIfNeeded() throws CollectorServiceException {
|
||||
if (morePages && projects.isEmpty()) {
|
||||
String resultPage = getNextPage();
|
||||
Document doc = null;
|
||||
try {
|
||||
doc = reader.read(IOUtils.toInputStream(resultPage));
|
||||
List<Element> records = doc.selectNodes("//RecordList/Record");
|
||||
if (records != null && !records.isEmpty()) {
|
||||
for (Element p : records) {
|
||||
|
||||
projects.add(p.asXML());
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
log.info("No more projects to read at page nr. " + pageNumber);
|
||||
morePages = false;
|
||||
return false;
|
||||
}
|
||||
} catch (DocumentException e) {
|
||||
throw new CollectorServiceException(e);
|
||||
}
|
||||
} else return false;
|
||||
}
|
||||
|
||||
private String getNextPage() {
|
||||
pageNumber++;
|
||||
try {
|
||||
URL pageUrl = new URL(queryURL + "&page=" + pageNumber);
|
||||
log.debug("Getting page at: " + pageUrl.toString());
|
||||
return IOUtils.toString(pageUrl);
|
||||
} catch (Exception e) {
|
||||
throw new CollectorServiceRuntimeException("Error on page " + pageNumber, e);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
package eu.dnetlib.data.collector.plugins.projects.gtr2;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
/**
|
||||
* Plugin to collect metadata record about projects and fundings via the RCUK grt2 API.
|
||||
* <p>
|
||||
* Documentation : http://gtr.rcuk.ac.uk/resources/api.html.
|
||||
* </p>
|
||||
* <p>
|
||||
* BaseURL: http://gtr.rcuk.ac.uk/gtr/api
|
||||
* The results returned by the API are XMLs.
|
||||
* </p>
|
||||
* <p>
|
||||
* Pagination: TO BE DEFINED. Exceeding the number of pages available will result in a HTTP response code of 404
|
||||
* </p>
|
||||
*
|
||||
* @author alessia
|
||||
*/
|
||||
public class Gtr2CollectorPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate)
|
||||
throws CollectorServiceException {
|
||||
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + fromDate); }
|
||||
|
||||
return new Gtr2ProjectsIterable(interfaceDescriptor.getBaseUrl(), fromDate);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,181 @@
|
|||
package eu.dnetlib.data.collector.plugins.projects.gtr2;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.StringWriter;
|
||||
import java.util.concurrent.*;
|
||||
|
||||
import com.ximpleware.AutoPilot;
|
||||
import com.ximpleware.VTDGen;
|
||||
import com.ximpleware.VTDNav;
|
||||
import eu.dnetlib.data.collector.plugins.HttpConnector;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.commons.lang3.*;
|
||||
|
||||
/**
 * Assembles a single self-contained XML package ("&lt;doc&gt;...&lt;/doc&gt;") for one gtr2 project.
 * <p>
 * The project fragment itself is written first; then, for each FUND / LEAD_ORG / PP_ORG / PI_PER
 * link found in the project record, the linked resource is downloaded and a reduced fragment is
 * appended by a worker task running on a fixed-size thread pool.
 * </p>
 * <p>
 * NOTE(review): the worker tasks all append to the shared {@code writer} (a StringWriter);
 * individual writes are synchronized by the underlying StringBuffer, but the ORDER in which
 * fragments are appended is nondeterministic — confirm that downstream consumers do not rely
 * on element order inside the produced &lt;doc&gt;.
 * </p>
 */
public class Gtr2Helper {

	private static final Log log = LogFactory.getLog(Gtr2Helper.class); // NOPMD by marko on 11/24/08 5:02 PM

	// Navigator/autopilot over the main project record being processed.
	private VTDNav mainVTDNav;
	private AutoPilot mainAutoPilot;
	// Accumulates the output package; shared with the worker Runnables below.
	private StringWriter writer;
	private HttpConnector connector;
	//private BlockingQueue<String> fragment = new ArrayBlockingQueue<String>(20);

	/**
	 * Builds the XML package for the project currently addressed by {@code vn}.
	 *
	 * @param vn         navigator positioned on the project record
	 * @param namespaces namespace declarations to put on the wrapping &lt;doc&gt; element
	 * @return the complete &lt;doc&gt; XML package as a string
	 * @throws Exception on any download, parse or XPath failure
	 */
	public String processProject(final VTDNav vn, final String namespaces) throws Exception {
		//log.debug("Processing project at "+projectURL);
		writer = new StringWriter();
		mainVTDNav = vn;
		mainAutoPilot = new AutoPilot(mainVTDNav);
		writer.write("<doc " + namespaces + ">");
		writeFragment(mainVTDNav);

		mainAutoPilot.selectXPath("//link[@rel='FUND']");
		ExecutorService es = Executors.newFixedThreadPool(5);

		// Note: the Thread objects below are used only as Runnables by the executor;
		// they are never started directly via Thread.start().
		while (mainAutoPilot.evalXPath() != -1) {
			Thread t = new Thread(new ProcessFunder(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href"))));
			es.execute(t);
		}

		mainAutoPilot.resetXPath();
		mainAutoPilot.selectXPath(".//link[@rel='LEAD_ORG']");
		while (mainAutoPilot.evalXPath() != -1) {
			Thread t = new Thread(new Org(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")),
					new String[] { "<ld-org>", "</ld-org>" }));
			es.execute(t);
		}
		mainAutoPilot.resetXPath();
		mainAutoPilot.selectXPath(".//link[@rel='PP_ORG']");
		while (mainAutoPilot.evalXPath() != -1) {
			Thread t = new Thread(new Org(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href")),
					new String[] { "<pp-org>","</pp-org>" }));
			es.execute(t);
		}
		mainAutoPilot.resetXPath();

		mainAutoPilot.selectXPath(".//link[@rel='PI_PER']");
		while (mainAutoPilot.evalXPath() != -1) {
			Thread t = new Thread(new PiPer(mainVTDNav.toNormalizedString(mainVTDNav.getAttrVal("href"))));
			es.execute(t);
		}
		// Wait (up to 10 minutes) for every linked-resource fragment to be appended
		// before closing the wrapping element.
		es.shutdown();
		log.debug("Waiting threads");
		es.awaitTermination(10, TimeUnit.MINUTES);

		log.debug("Finished writing project");
		writer.write("</doc>");
		writer.close();

		return writer.toString();
	}

	/**
	 * Downloads {@code httpUrl} and parses it into a fresh VTD navigator.
	 * NOTE(review): returns null on ANY failure — callers (the Runnables below)
	 * do not null-check, so a failed download surfaces later as an exception
	 * caught and only logged inside run().
	 */
	private VTDNav setNavigator(final String httpUrl) {
		VTDGen vg_tmp = new VTDGen();
		connector = new HttpConnector();
		try {
			byte[] bytes = connector.getInputSource(httpUrl).getBytes("UTF-8");
			vg_tmp.setDoc(bytes);
			vg_tmp.parse(false);
			//vg_tmp.parseHttpUrl(httpUrl, false);
			return vg_tmp.getNav();
		}catch (Throwable e){
			return null;
		}
	}

	/** Evaluates {@code xPath} on the given navigator, returning the first match index or -1. */
	private int evalXpath(final VTDNav fragmentVTDNav, final String xPath) throws Exception {

		AutoPilot ap_tmp = new AutoPilot(fragmentVTDNav);
		ap_tmp.selectXPath(xPath);
		return ap_tmp.evalXPath();
	}

	/** Dumps the XML fragment currently addressed by {@code nav} into the shared writer. */
	private void writeFragment(final VTDNav nav) throws Exception {
		ByteArrayOutputStream b = new ByteArrayOutputStream();
		nav.dumpFragment(b);
		String ret = b.toString();
		b.reset();
		writer.write(ret);
	}

	/**
	 * Writes {@code xmlOpenTag} + value + {@code xmlCloseTag} for the first node matching
	 * {@code xPath}. The value is the attribute {@code attrName} when given, otherwise the
	 * element text (XML-escaped). Writes nothing when the XPath has no match.
	 */
	private void writeNewTagAndInfo(final VTDNav vn, final String xPath, final String xmlOpenTag, final String xmlCloseTag, final String attrName) throws Exception {

		int nav_res = evalXpath(vn, xPath);
		if (nav_res != -1) {
			String tmp = xmlOpenTag;
			if (attrName != null) tmp += (vn.toNormalizedString(vn.getAttrVal(attrName)));
			else
				tmp += (StringEscapeUtils.escapeXml11(vn.toNormalizedString(vn.getText())));
			tmp += (xmlCloseTag);
			writer.write(tmp);
		}
	}

	/** Worker: appends the full fragment of a PI_PER (principal investigator) resource. */
	private class PiPer implements Runnable {

		private VTDNav vn;

		public PiPer(String httpURL) {
			// Download happens eagerly in the constructor, on the submitting thread.
			vn = setNavigator(httpURL);
		}

		@Override
		public void run() {
			try {
				writeFragment(vn);
			} catch (Throwable e) {log.debug("Eccezione in PiPer " + e.getMessage());}

		}
	}

	/** Worker: appends name/country/id of an organisation, wrapped in the given open/close tags. */
	private class Org implements Runnable {

		// tags[0] is the opening wrapper (e.g. "<ld-org>"), tags[1] the closing one.
		private String[] tags;
		private VTDNav vn;

		public Org(final String httpURL, final String[] tags) {
			vn = setNavigator(httpURL);
			this.tags = tags;
		}

		@Override
		public void run() {
			try {
				writeNewTagAndInfo(vn, "//name", tags[0]+"<name>", "</name>", null);
				vn.toElement(VTDNav.ROOT);
				writeNewTagAndInfo(vn, "//country", "<country>", "</country>", null);
				vn.toElement(VTDNav.ROOT);
				writeNewTagAndInfo(vn, ".", "<id>", "</id>"+tags[1], "id");
			} catch (Throwable e) {
				log.debug("Eccezione in Org " + e.getMessage());
			}
		}

	}

	/** Worker: appends a FUND fragment plus the name of each linked FUNDER resource. */
	private class ProcessFunder implements Runnable {

		private VTDNav vn;

		public ProcessFunder(final String httpURL) {
			vn = setNavigator(httpURL);
		}

		@Override
		public void run() {

			try {
				AutoPilot ap = new AutoPilot(vn);
				writeFragment(vn);
				ap.selectXPath(".//link[@rel='FUNDER']");
				VTDNav tmp_vn;
				while (ap.evalXPath() != -1) {
					// Each FUNDER link triggers a further download of the funder record.
					tmp_vn = setNavigator(vn.toNormalizedString(vn.getAttrVal("href")));
					writeNewTagAndInfo(tmp_vn, "//name", "<funder> <name>", "</name></funder>", null);
				}
			} catch (Throwable e) {log.debug("Eccezione in Funder" + e.getMessage());}
		}

	}
}
|
|
@ -0,0 +1,352 @@
|
|||
package eu.dnetlib.data.collector.plugins.projects.gtr2;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import com.ximpleware.AutoPilot;
|
||||
import com.ximpleware.VTDGen;
|
||||
import com.ximpleware.VTDNav;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
|
||||
import eu.dnetlib.enabling.resultset.SizedIterable;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.joda.time.DateTime;
|
||||
import org.joda.time.format.DateTimeFormat;
|
||||
import org.joda.time.format.DateTimeFormatter;
|
||||
import eu.dnetlib.data.collector.plugins.HttpConnector;
|
||||
|
||||
/**
|
||||
* Created by alessia on 28/11/16.
|
||||
*/
|
||||
/**
 * Created by alessia on 28/11/16.
 * <p>
 * Streams gtr2 project packages with a producer/consumer scheme: a background
 * {@link FillProjectList} thread walks the paginated '/projects' listing and submits one
 * {@link ParseProject} task per project to a fixed thread pool; each task puts the assembled
 * XML package on the bounded {@code projects} queue. The consumer side (the iterator) polls
 * the queue until it receives the {@link #TERMINATOR} sentinel.
 * </p>
 */
public class Gtr2ProjectsIterable implements SizedIterable<String> {

	// Sentinel value enqueued once all producer work is done; tells the iterator to stop.
	public static final String TERMINATOR = "ARNOLD";
	// Maximum seconds the consumer waits on the queue (and the producer waits for the pool).
	public static final int WAIT_END_SECONDS = 120;
	// Thread-pool size for per-project parsing tasks. (sic: name should be PAGE_SIZE,
	// kept as-is because it is a public constant.)
	public static final int PAGE_SZIE = 20;

	private static final Log log = LogFactory.getLog(Gtr2ProjectsIterable.class);

	private String queryURL;
	// Total number of projects declared by the API; -1 until known.
	private int total = -1;
	private int startFromPage = 1;
	private int endAtPage;
	private VTDGen vg;
	private VTDNav vn;
	private AutoPilot ap;
	// Namespace declarations copied from the listing root, re-declared on each package.
	private String namespaces;
	private boolean incremental = false;
	private DateTime fromDate;
	private DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
	// Bounded hand-off queue between producer tasks and the consuming iterator.
	private ArrayBlockingQueue<String> projects = new ArrayBlockingQueue<String>(20);
	//private boolean finished = false;
	private final ExecutorService es = Executors.newFixedThreadPool(PAGE_SZIE);
	// One-element look-ahead buffer used by hasNext()/next().
	private String nextElement = null;
	private HttpConnector connector;

	/**
	 * Harvests ALL pages of the '/projects' listing.
	 *
	 * @param baseUrl  gtr2 API base URL
	 * @param fromDate optional incremental lower bound 'yyyy-MM-dd'; blank/null means full harvest
	 */
	public Gtr2ProjectsIterable(final String baseUrl, final String fromDate) throws CollectorServiceException {
		prepare(baseUrl, fromDate);
		fillInfo(true);
	}

	/**
	 * Harvests only the page range [startFromPage, endAtPage]; total count is NOT set (-1).
	 */
	public Gtr2ProjectsIterable(final String baseUrl, final String fromDate, final int startFromPage, final int endAtPage) throws CollectorServiceException {
		prepare(baseUrl, fromDate);
		this.setStartFromPage(startFromPage);
		this.setEndAtPage(endAtPage);
		fillInfo(false);
	}

	// Common constructor setup: target URL, parser, and the incremental-harvest cut-off date.
	private void prepare(final String baseUrl, final String fromDate) {
		connector = new HttpConnector();
		queryURL = baseUrl + "/projects";
		vg = new VTDGen();
		this.incremental = StringUtils.isNotBlank(fromDate);
		if (incremental) {
			// I expect fromDate in the format 'yyyy-MM-dd'. See class eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode
			this.fromDate = DateTime.parse(fromDate, simpleDateTimeFormatter);
			log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString());
		}
	}

	@Override
	public int getNumberOfElements() {
		return total;
	}

	/**
	 * Fetches the first listing page, reads totals and namespace declarations from its root
	 * attributes, and starts the background producer thread.
	 *
	 * @param all when true, endAtPage/total are taken from the listing's totalPages/totalSize
	 */
	private void fillInfo(final boolean all) throws CollectorServiceException {
		try {
			// log.debug("Getting hit count from: " + queryURL);
			byte[] bytes = connector.getInputSource(queryURL).getBytes("UTF-8");
			vg.setDoc(bytes);
			vg.parse(false);
			//vg.parseHttpUrl(queryURL, false);
			initParser();
			String hitCount = vn.toNormalizedString(vn.getAttrVal("totalSize"));
			String totalPages = vn.toNormalizedString(vn.getAttrVal("totalPages"));
			namespaces = "xmlns:ns1=\"" + vn.toNormalizedString(vn.getAttrVal("ns1")) + "\" ";
			namespaces += "xmlns:ns2=\"" + vn.toNormalizedString(vn.getAttrVal("ns2")) + "\" ";
			namespaces += "xmlns:ns3=\"" + vn.toNormalizedString(vn.getAttrVal("ns3")) + "\" ";
			namespaces += "xmlns:ns4=\"" + vn.toNormalizedString(vn.getAttrVal("ns4")) + "\" ";
			namespaces += "xmlns:ns5=\"" + vn.toNormalizedString(vn.getAttrVal("ns5")) + "\" ";
			namespaces += "xmlns:ns6=\"" + vn.toNormalizedString(vn.getAttrVal("ns6")) + "\" ";
			if (all) {
				setEndAtPage(Integer.parseInt(totalPages));
				total = Integer.parseInt(hitCount);
			}
			// Producer starts here; results arrive asynchronously on the 'projects' queue.
			Thread ft = new Thread(new FillProjectList());
			ft.start();
			log.debug("Expected number of pages: " + (endAtPage - startFromPage + 1));
		} catch (NumberFormatException e) {
			log.error("Cannot set the total count or the number of pages");
			throw new CollectorServiceException(e);
		} catch (Throwable e) {
			throw new CollectorServiceException(e);
		}
	}

	/**
	 * Consumer side. NOTE(review): hasNext()/next() share the outer 'nextElement' buffer,
	 * so a single iterator instance is assumed; confirm callers never iterate concurrently.
	 */
	@Override
	public Iterator<String> iterator() {

		return new Iterator<String>() {
			// The following is for debug only
			private int nextCounter = 0;

			@Override
			public boolean hasNext() {
				try {
					log.debug("hasNext?");
					if (nextElement == null) {
						// Block (bounded) until the producer delivers a record or the sentinel.
						nextElement = projects.poll(WAIT_END_SECONDS, TimeUnit.SECONDS);
						log.debug("Exit poll :-)");
					}
					// A null poll result (timeout) and the TERMINATOR both end the iteration.
					return nextElement != null && !nextElement.equals(TERMINATOR);
				} catch (InterruptedException e) {
					throw new CollectorServiceRuntimeException(e);
				}
			}

			@Override
			public String next() {
				nextCounter++;
				log.debug(String.format("Calling next %s times.", nextCounter));

				if (nextElement == null) throw new NoSuchElementException();
				else {
					// Hand over the buffered element and clear the look-ahead slot.
					String res = nextElement;
					nextElement = null;
					return res;
				}
			}

			@Override
			public void remove() {
				throw new UnsupportedOperationException();
			}

		};
	}

	// Re-creates navigator and autopilot after each vg.parse().
	private void initParser() {
		vn = vg.getNav();
		ap = new AutoPilot(vn);
	}

	public String getQueryURL() {
		return queryURL;
	}

	public void setQueryURL(final String queryURL) {
		this.queryURL = queryURL;
	}

	public int getTotal() {
		return total;
	}

	public void setTotal(final int total) {
		this.total = total;
	}

	public int getEndAtPage() {
		return endAtPage;
	}

	public void setEndAtPage(final int endAtPage) {
		this.endAtPage = endAtPage;
		log.debug("Overriding endAtPage to " + endAtPage);
	}

	public VTDGen getVg() {
		return vg;
	}

	public void setVg(final VTDGen vg) {
		this.vg = vg;
	}

	public VTDNav getVn() {
		return vn;
	}

	public void setVn(final VTDNav vn) {
		this.vn = vn;
	}

	public AutoPilot getAp() {
		return ap;
	}

	public void setAp(final AutoPilot ap) {
		this.ap = ap;
	}

	public String getNamespaces() {
		return namespaces;
	}

	public void setNamespaces(final String namespaces) {
		this.namespaces = namespaces;
	}

	public int getStartFromPage() {
		return startFromPage;
	}

	public void setStartFromPage(final int startFromPage) {
		this.startFromPage = startFromPage;
		log.debug("Overriding startFromPage to " + startFromPage);
	}

	/**
	 * Producer: walks the listing pages [startFromPage, endAtPage], submitting one
	 * ParseProject task per project link, then enqueues TERMINATOR when the pool drains.
	 */
	private class FillProjectList implements Runnable {

		private boolean morePages = true;
		private int pageNumber = startFromPage;

		@Override
		public void run() {
			String resultPageUrl = "";
			try {
				do {
					resultPageUrl = getNextPageUrl();
					log.debug("Page: " + resultPageUrl);
					// clear VGen before processing the next file
					vg.clear();
					byte[] bytes = connector.getInputSource(resultPageUrl).getBytes("UTF-8");
					vg.setDoc(bytes);
					vg.parse(false);
					//vg.parseHttpUrl(resultPageUrl, false);
					initParser();
					ap.selectXPath("//project");
					int res;

					while ((res = ap.evalXPath()) != -1) {
						final String projectHref = vn.toNormalizedString(vn.getAttrVal("href"));
						Thread t = new Thread(new ParseProject(projectHref));
						t.setName("Thread for " + res);
						es.execute(t);
					}
					ap.resetXPath();

				} while (morePages);
				// All pages scheduled: wait for the workers, then signal end-of-stream.
				es.shutdown();
				es.awaitTermination(WAIT_END_SECONDS, TimeUnit.SECONDS);
				projects.put(TERMINATOR);

			} catch (Throwable e) {
				// NOTE(review): on failure the TERMINATOR is never enqueued, so the consumer
				// only stops via the poll timeout — confirm this is the intended behaviour.
				log.error("Exception processing " + resultPageUrl + "\n" + e.getMessage());
			}
		}

		// Builds the URL of the current page and flips 'morePages' when the last page is reached.
		private String getNextPageUrl() {
			String url = queryURL + "?p=" + pageNumber;
			if (pageNumber == endAtPage) {
				morePages = false;
			}
			pageNumber++;
			return url;
		}

	}

	/**
	 * Worker: downloads one project record, applies the incremental-date filter, and puts
	 * the assembled package (via Gtr2Helper) on the shared queue.
	 */
	private class ParseProject implements Runnable {

		VTDNav vn1;
		VTDGen vg1;
		private String projectRef;

		public ParseProject(final String projectHref) {
			projectRef = projectHref;
			vg1 = new VTDGen();
			try {
				byte[] bytes = connector.getInputSource(projectRef).getBytes("UTF-8");
				vg1.setDoc(bytes);
				vg1.parse(false);
				//vg1.parseHttpUrl(projectRef, false);
				vn1 = vg1.getNav();
			}catch(Throwable e){
				// NOTE(review): vn1 stays null here; run() will then fail and rethrow.
				log.error("Exception processing " + projectRef + "\n" + e.getMessage());
			}
		}

		/**
		 * Checks the date attribute {@code attr} (e.g. 'created'/'updated') of the record.
		 *
		 * @return the attribute index when present and after {@code fromDate}; -1 otherwise
		 */
		private int projectsUpdate(String attr) throws CollectorServiceException {
			try {
				int index = vn1.getAttrVal(attr);
				if (index != -1) {
					// Attribute value looks like an ISO timestamp; only the date part is compared.
					String d = vn1.toNormalizedString(index);
					DateTime recordDate = DateTime.parse(d.substring(0, d.indexOf("T")), simpleDateTimeFormatter);
					// updated or created after the last time it was collected
					if (recordDate.isAfter(fromDate)) {
						log.debug("New project to collect");
						return index;
					}
					return -1;
				}
				return index;
			} catch (Throwable e) {
				throw new CollectorServiceException(e);
			}
		}

		// Builds the full XML package for this project via Gtr2Helper.
		private String collectProject() throws CollectorServiceException {
			try {

				int p = vn1.getAttrVal("href");

				final String projectHref = vn1.toNormalizedString(p);
				log.debug("collecting project at " + projectHref);

				Gtr2Helper gtr2Helper = new Gtr2Helper();
				String projectPackage = gtr2Helper.processProject(vn1, namespaces);

				return projectPackage;
			} catch (Throwable e) {
				throw new CollectorServiceException(e);
			}
		}

		// True when the given date attribute marks the record as new/changed since fromDate.
		private boolean add(String attr) throws CollectorServiceException {
			return projectsUpdate(attr) != -1;
		}

		@Override
		public void run() {
			log.debug("Getting project info from " + projectRef);
			try {
				// Full harvest takes everything; incremental harvest requires a
				// 'created' or 'updated' date after fromDate.
				if (!incremental || (incremental && (add("created") || add("updated")))) {
					projects.put(collectProject());
					log.debug("Project enqueued " + projectRef);
				}
			} catch (Throwable e) {
				log.error("Error on ParseProject " + e.getMessage());
				throw new CollectorServiceRuntimeException(e);
			}
		}

	}

}
|
|
@ -0,0 +1,59 @@
|
|||
/**
|
||||
*
|
||||
*/
|
||||
package eu.dnetlib.data.collector.plugins.rest;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
/**
|
||||
* @author js, Andreas Czerniak
|
||||
*
|
||||
*/
|
||||
public class RestCollectorPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(InterfaceDescriptor ifDescriptor, String arg1, String arg2)
|
||||
throws CollectorServiceException {
|
||||
final String baseUrl = ifDescriptor.getBaseUrl();
|
||||
final String resumptionType = ifDescriptor.getParams().get("resumptionType");
|
||||
final String resumptionParam = ifDescriptor.getParams().get("resumptionParam");
|
||||
final String resumptionXpath = ifDescriptor.getParams().get("resumptionXpath");
|
||||
final String resultTotalXpath = ifDescriptor.getParams().get("resultTotalXpath");
|
||||
final String resultFormatParam = ifDescriptor.getParams().get("resultFormatParam");
|
||||
final String resultFormatValue = ifDescriptor.getParams().get("resultFormatValue");
|
||||
final String resultSizeParam = ifDescriptor.getParams().get("resultSizeParam");
|
||||
final String resultSizeValue = (StringUtils.isBlank(ifDescriptor.getParams().get("resultSizeValue"))) ? "100" : ifDescriptor.getParams().get("resultSizeValue");
|
||||
final String queryParams = ifDescriptor.getParams().get("queryParams");
|
||||
final String entityXpath = ifDescriptor.getParams().get("entityXpath");
|
||||
|
||||
if (StringUtils.isBlank(baseUrl)) {throw new CollectorServiceException("Param 'baseUrl' is null or empty");}
|
||||
if (StringUtils.isBlank(resumptionType)) {throw new CollectorServiceException("Param 'resumptionType' is null or empty");}
|
||||
if (StringUtils.isBlank(resumptionParam)) {throw new CollectorServiceException("Param 'resumptionParam' is null or empty");}
|
||||
// if (StringUtils.isBlank(resumptionXpath)) {throw new CollectorServiceException("Param 'resumptionXpath' is null or empty");}
|
||||
// if (StringUtils.isBlank(resultTotalXpath)) {throw new CollectorServiceException("Param 'resultTotalXpath' is null or empty");}
|
||||
// resultFormatParam can be emtpy because some Rest-APIs doesn't like this argument in the query
|
||||
//if (StringUtils.isBlank(resultFormatParam)) {throw new CollectorServiceException("Param 'resultFormatParam' is null, empty or whitespace");}
|
||||
if (StringUtils.isBlank(resultFormatValue)) {throw new CollectorServiceException("Param 'resultFormatValue' is null or empty");}
|
||||
if (StringUtils.isBlank(resultSizeParam)) {throw new CollectorServiceException("Param 'resultSizeParam' is null or empty");}
|
||||
// prevent resumptionType: discover -- if (Integer.valueOf(resultSizeValue) <= 1) {throw new CollectorServiceException("Param 'resultSizeValue' is less than 2");}
|
||||
if (StringUtils.isBlank(queryParams)) {throw new CollectorServiceException("Param 'queryParams' is null or empty");}
|
||||
if (StringUtils.isBlank(entityXpath)) {throw new CollectorServiceException("Param 'entityXpath' is null or empty");}
|
||||
|
||||
return () -> new RestIterator(
|
||||
baseUrl,
|
||||
resumptionType,
|
||||
resumptionParam,
|
||||
resumptionXpath,
|
||||
resultTotalXpath,
|
||||
resultFormatParam,
|
||||
resultFormatValue,
|
||||
resultSizeParam,
|
||||
resultSizeValue,
|
||||
queryParams,
|
||||
entityXpath);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,343 @@
|
|||
/**
|
||||
* log.debug(...) equal to log.trace(...) in the application-logs
|
||||
* <p>
|
||||
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
|
||||
*/
|
||||
package eu.dnetlib.data.collector.plugins.rest;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.StringWriter;
|
||||
import java.net.URL;
|
||||
import java.util.Iterator;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerConfigurationException;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import javax.xml.xpath.*;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.oai.engine.XmlCleaner;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
/**
|
||||
* @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
|
||||
* @date 2018-09-03
|
||||
*
|
||||
*/
|
||||
public class RestIterator implements Iterator<String> {
|
||||
|
||||
// TODO: clean up the comments of replaced source code
|
||||
private static final Log log = LogFactory.getLog(RestIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
|
||||
|
||||
private static final String wrapName = "recordWrap";
|
||||
private String baseUrl;
|
||||
private String resumptionType;
|
||||
private String resumptionParam;
|
||||
private String resultFormatValue;
|
||||
private String queryParams;
|
||||
private int resultSizeValue;
|
||||
private int resumptionInt = 0; // integer resumption token (first record to harvest)
|
||||
private int resultTotal = -1;
|
||||
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest or token scanned from results)
|
||||
private InputStream resultStream;
|
||||
private Transformer transformer;
|
||||
private XPath xpath;
|
||||
private String query;
|
||||
private XPathExpression xprResultTotalPath;
|
||||
private XPathExpression xprResumptionPath;
|
||||
private XPathExpression xprEntity;
|
||||
private String queryFormat;
|
||||
private String querySize;
|
||||
private Queue<String> recordQueue = new PriorityBlockingQueue<String>();
|
||||
private int discoverResultSize = 0;
|
||||
private int pagination = 1;
|
||||
|
||||
public RestIterator(
|
||||
final String baseUrl,
|
||||
final String resumptionType,
|
||||
final String resumptionParam,
|
||||
final String resumptionXpath,
|
||||
final String resultTotalXpath,
|
||||
final String resultFormatParam,
|
||||
final String resultFormatValue,
|
||||
final String resultSizeParam,
|
||||
final String resultSizeValueStr,
|
||||
final String queryParams,
|
||||
final String entityXpath
|
||||
) {
|
||||
this.baseUrl = baseUrl;
|
||||
this.resumptionType = resumptionType;
|
||||
this.resumptionParam = resumptionParam;
|
||||
this.resultFormatValue = resultFormatValue;
|
||||
this.queryParams = queryParams;
|
||||
this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
|
||||
|
||||
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : "";
|
||||
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
|
||||
|
||||
try {
|
||||
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
|
||||
} catch (Exception e) {
|
||||
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
|
||||
}
|
||||
initQueue();
|
||||
}
|
||||
|
||||
	/**
	 * Compiles the XPath expressions and configures the DOM-to-string transformer
	 * used to serialize each matched record node.
	 * A blank resumptionXpath is replaced by "/" so the expression always compiles.
	 *
	 * @throws TransformerConfigurationException when the transformer cannot be created
	 * @throws XPathExpressionException          when one of the XPaths does not compile
	 */
	private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
			throws TransformerConfigurationException, XPathExpressionException {
		transformer = TransformerFactory.newInstance().newTransformer();
		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
		transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
		xpath = XPathFactory.newInstance().newXPath();
		xprResultTotalPath = xpath.compile(resultTotalXpath);
		xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
		xprEntity = xpath.compile(entityXpath);
	}
|
||||
|
||||
private void initQueue() {
|
||||
query = baseUrl + "?" + queryParams + querySize + queryFormat;
|
||||
}
|
||||
|
||||
	/**
	 * Releases resources held between pages.
	 * NOTE(review): currently a no-op — 'resultStream' is never closed here; confirm
	 * whether the open stream should be closed when iteration ends.
	 */
	private void disconnect() {
		// TODO close inputstream
	}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see java.util.Iterator#hasNext()
|
||||
*/
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (recordQueue.isEmpty() && query.isEmpty()) {
|
||||
disconnect();
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see java.util.Iterator#next()
|
||||
*/
|
||||
@Override
|
||||
public String next() {
|
||||
synchronized (recordQueue) {
|
||||
while (recordQueue.isEmpty() && !query.isEmpty()) {
|
||||
try {
|
||||
log.info("get Query: " + query);
|
||||
query = downloadPage(query);
|
||||
log.debug("next queryURL from downloadPage(): " + query);
|
||||
} catch (CollectorServiceException e) {
|
||||
log.debug("CollectorPlugin.next()-Exception: " + e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
return recordQueue.poll();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* download page and return nextQuery
|
||||
*/
|
||||
private String downloadPage(String query) throws CollectorServiceException {
|
||||
String resultJson;
|
||||
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||
String nextQuery = "";
|
||||
String emptyXml = resultXml + "<" + wrapName + "></" + wrapName + ">";
|
||||
Node resultNode = null;
|
||||
NodeList nodeList = null;
|
||||
String qUrlArgument = "";
|
||||
int urlOldResumptionSize = 0;
|
||||
|
||||
try {
|
||||
URL qUrl = new URL(query);
|
||||
|
||||
resultStream = qUrl.openStream();
|
||||
if ("json".equals(resultFormatValue.toLowerCase())) {
|
||||
|
||||
resultJson = IOUtils.toString(resultStream, "UTF-8");
|
||||
resultJson = syntaxConvertJsonKeyNamens(resultJson);
|
||||
org.json.JSONObject jsonObject = new org.json.JSONObject(resultJson);
|
||||
resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element
|
||||
log.trace("before inputStream: " + resultXml);
|
||||
resultXml = XmlCleaner.cleanAllEntities(resultXml);
|
||||
log.trace("after cleaning: " + resultXml);
|
||||
resultStream = IOUtils.toInputStream(resultXml, "UTF-8");
|
||||
}
|
||||
|
||||
if (!(emptyXml.toLowerCase()).equals(resultXml.toLowerCase())) {
|
||||
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
|
||||
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
|
||||
log.debug("nodeList.length: " + nodeList.getLength());
|
||||
for (int i = 0; i < nodeList.getLength(); i++) {
|
||||
StringWriter sw = new StringWriter();
|
||||
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
|
||||
recordQueue.add(sw.toString());
|
||||
}
|
||||
} else { log.info("resultXml is equal with emptyXml"); }
|
||||
|
||||
resumptionInt += resultSizeValue;
|
||||
|
||||
switch (resumptionType.toLowerCase()) {
|
||||
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
|
||||
resumptionStr = xprResumptionPath.evaluate(resultNode);
|
||||
break;
|
||||
|
||||
case "count": // begin at one step for all records, iterate over items
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
break;
|
||||
|
||||
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
|
||||
if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: discover, Param 'resultSizeValue' is less than 2");}
|
||||
qUrlArgument = qUrl.getQuery();
|
||||
String[] arrayQUrlArgument = qUrlArgument.split("&");
|
||||
for (String arrayUrlArgStr : arrayQUrlArgument) {
|
||||
if (arrayUrlArgStr.startsWith(resumptionParam)) {
|
||||
String[] resumptionKeyValue = arrayUrlArgStr.split("=");
|
||||
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
|
||||
log.debug("discover OldResumptionSize from Url: " + urlOldResumptionSize);
|
||||
}
|
||||
}
|
||||
|
||||
if (((emptyXml.toLowerCase()).equals(resultXml.toLowerCase()))
|
||||
|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))
|
||||
) {
|
||||
// resumptionStr = "";
|
||||
if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
|
||||
resultTotal = discoverResultSize;
|
||||
} else {
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
resultTotal = resumptionInt + 1;
|
||||
if (nodeList != null) { discoverResultSize += nodeList.getLength(); }
|
||||
}
|
||||
log.info("discoverResultSize: " + discoverResultSize);
|
||||
break;
|
||||
|
||||
case "pagination":
|
||||
case "page": // pagination, iterate over pages
|
||||
pagination += 1;
|
||||
if (nodeList != null) {
|
||||
discoverResultSize += nodeList.getLength();
|
||||
} else {
|
||||
resultTotal = discoverResultSize;
|
||||
pagination = discoverResultSize;
|
||||
}
|
||||
resumptionInt = pagination;
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
break;
|
||||
|
||||
default: // otherwise: abort
|
||||
// resultTotal = resumptionInt;
|
||||
break;
|
||||
}
|
||||
|
||||
if (resultTotal == -1) {
|
||||
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
|
||||
if (resumptionType.toLowerCase().equals("page")) { resultTotal += 1; } // to correct the upper bound
|
||||
log.info("resultTotal was -1 is now: " + resultTotal);
|
||||
}
|
||||
log.info("resultTotal: " + resultTotal);
|
||||
log.info("resInt: " + resumptionInt);
|
||||
if (resumptionInt < resultTotal) {
|
||||
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat;
|
||||
} else
|
||||
nextQuery = "";
|
||||
|
||||
log.debug("nextQueryUrl: " + nextQuery);
|
||||
return nextQuery;
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error(e);
|
||||
throw new IllegalStateException("collection failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' to ''
|
||||
* check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names
|
||||
* and work-around for the JSON to XML converting of org.json.XML-package.
|
||||
*
|
||||
* known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"],
|
||||
*
|
||||
* @param jsonInput
|
||||
* @return convertedJsonKeynameOutput
|
||||
*/
|
||||
private String syntaxConvertJsonKeyNamens(String jsonInput) {
|
||||
|
||||
log.trace("before convertJsonKeyNames: " + jsonInput);
|
||||
// pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml)
|
||||
// replace ' 's in JSON Namens with '_'
|
||||
while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":");
|
||||
}
|
||||
|
||||
// replace forward-slash (sign '/' ) in JSON Names with '_'
|
||||
while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":");
|
||||
}
|
||||
|
||||
// replace '(' in JSON Names with ''
|
||||
while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":");
|
||||
}
|
||||
|
||||
// replace ')' in JSON Names with ''
|
||||
while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":");
|
||||
}
|
||||
|
||||
// replace startNumbers in JSON Keynames with 'n_'
|
||||
while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":");
|
||||
}
|
||||
|
||||
// replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with ''
|
||||
while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":");
|
||||
}
|
||||
|
||||
// replace ',' in JSON Keynames with '.' to prevent , in xml tagnames.
|
||||
// while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) {
|
||||
// jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":");
|
||||
// }
|
||||
|
||||
// replace '=' in JSON Keynames with '-'
|
||||
while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) {
|
||||
jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":");
|
||||
}
|
||||
|
||||
log.trace("after syntaxConvertJsonKeyNames: " + jsonInput);
|
||||
return jsonInput;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* https://www.w3.org/TR/REC-xml/#charencoding shows character enoding in entities
|
||||
* *
|
||||
* @param bufferStr - XML string
|
||||
* @return
|
||||
*/
|
||||
private static String cleanUnwantedJsonCharsInXmlTagnames(String bufferStr) {
|
||||
|
||||
while (bufferStr.matches(".*<([^<>].*),(.)>.*")) {
|
||||
bufferStr = bufferStr.replaceAll("<([^<>.*),(.*)>", "<$1$2>");
|
||||
}
|
||||
|
||||
// replace [#x10-#x1f] with ''
|
||||
// while (bufferStr.matches(".*[0-9a-f].*")) {
|
||||
// bufferStr = bufferStr.replaceAll("([0-9a-fA-F])", "");
|
||||
// }
|
||||
|
||||
return bufferStr;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,685 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg;
|
||||
|
||||
import org.w3c.dom.Attr;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Element;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import java.io.StringWriter;
|
||||
import java.time.LocalDate;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
public class DatasetDocument {
|
||||
// Field lists mirror the DataCite-like <dataset> record produced by toXml();
// all of them are optional (null means "omit the corresponding element").
private List<Identifier> identifiers;
private List<Creator> creators;
private List<String> titles;
private List<String> alternativeTitles;
private List<String> publishers;
private List<LocalDate> publicationDates;
private List<String> subjects;
private List<Contributor> contributors;
private List<LocalDate> createdDates;
private List<LocalDate> updatedDates;
private List<String> languages;
private List<ResourceType> resourceTypes;
private List<AlternateIdentifier> alternateIdentifier;
private List<Citation> citations;
private List<String> sizes;
private List<String> format;
private List<String> version;
private List<License> licenses;
private List<String> descriptions;
private List<String> disambiguatingDescriptions;
private List<SpatialCoverage> geoLocations;

// Plain accessors; the setters store the given list reference without copying.

public List<Identifier> getIdentifiers() {
	return identifiers;
}

public void setIdentifiers(List<Identifier> identifiers) {
	this.identifiers = identifiers;
}

public List<Creator> getCreators() {
	return creators;
}

public void setCreators(List<Creator> creators) {
	this.creators = creators;
}

public List<String> getTitles() {
	return titles;
}

public void setTitles(List<String> titles) {
	this.titles = titles;
}

public List<String> getAlternativeTitles() {
	return alternativeTitles;
}

public void setAlternativeTitles(List<String> alternativeTitles) {
	this.alternativeTitles = alternativeTitles;
}

public List<String> getPublishers() {
	return publishers;
}

public void setPublishers(List<String> publishers) {
	this.publishers = publishers;
}

public List<LocalDate> getPublicationDates() {
	return publicationDates;
}

public void setPublicationDates(List<LocalDate> publicationDates) {
	this.publicationDates = publicationDates;
}

public List<String> getSubjects() {
	return subjects;
}

public void setSubjects(List<String> subjects) {
	this.subjects = subjects;
}

public List<Contributor> getContributors() {
	return contributors;
}

public void setContributors(List<Contributor> contributors) {
	this.contributors = contributors;
}

public List<LocalDate> getCreatedDates() {
	return createdDates;
}

public void setCreatedDates(List<LocalDate> createdDates) {
	this.createdDates = createdDates;
}

public List<LocalDate> getUpdatedDates() {
	return updatedDates;
}

public void setUpdatedDates(List<LocalDate> updatedDates) {
	this.updatedDates = updatedDates;
}

public List<String> getLanguages() {
	return languages;
}

public void setLanguages(List<String> languages) {
	this.languages = languages;
}

public List<ResourceType> getResourceTypes() {
	return resourceTypes;
}

public void setResourceTypes(List<ResourceType> resourceTypes) {
	this.resourceTypes = resourceTypes;
}

public List<AlternateIdentifier> getAlternateIdentifier() {
	return alternateIdentifier;
}

public void setAlternateIdentifier(List<AlternateIdentifier> alternateIdentifier) {
	this.alternateIdentifier = alternateIdentifier;
}

public List<Citation> getCitations() {
	return citations;
}

public void setCitations(List<Citation> citations) {
	this.citations = citations;
}

public List<String> getSizes() {
	return sizes;
}

public void setSizes(List<String> sizes) {
	this.sizes = sizes;
}

public List<String> getFormat() {
	return format;
}

public void setFormat(List<String> format) {
	this.format = format;
}

public List<String> getVersion() {
	return version;
}

public void setVersion(List<String> version) {
	this.version = version;
}

public List<License> getLicenses() {
	return licenses;
}

public void setLicenses(List<License> licenses) {
	this.licenses = licenses;
}

public List<String> getDescriptions() {
	return descriptions;
}

public void setDescriptions(List<String> descriptions) {
	this.descriptions = descriptions;
}

public List<String> getDisambiguatingDescriptions() {
	return disambiguatingDescriptions;
}

public void setDisambiguatingDescriptions(List<String> disambiguatingDescriptions) {
	this.disambiguatingDescriptions = disambiguatingDescriptions;
}

public List<SpatialCoverage> getGeoLocations() {
	return geoLocations;
}

public void setGeoLocations(List<SpatialCoverage> geoLocations) {
	this.geoLocations = geoLocations;
}
|
||||
|
||||
private static String emptyXml;
|
||||
private static Object lockEmptyXml = new Object();
|
||||
public static String emptyXml() {
|
||||
if(DatasetDocument.emptyXml!=null) return DatasetDocument.emptyXml;
|
||||
|
||||
String xml = null;
|
||||
try {
|
||||
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
|
||||
Document doc = docBuilder.newDocument();
|
||||
|
||||
Element root = doc.createElement("dataset");
|
||||
doc.appendChild(root);
|
||||
|
||||
TransformerFactory tf = TransformerFactory.newInstance();
|
||||
Transformer transformer = tf.newTransformer();
|
||||
StringWriter writer = new StringWriter();
|
||||
transformer.transform(new DOMSource(doc), new StreamResult(writer));
|
||||
xml = writer.getBuffer().toString();
|
||||
}catch(Exception ex){
|
||||
xml = "<dataset/>";
|
||||
}
|
||||
|
||||
synchronized (DatasetDocument.lockEmptyXml) {
|
||||
if (DatasetDocument.emptyXml == null) DatasetDocument.emptyXml = xml;
|
||||
}
|
||||
|
||||
return DatasetDocument.emptyXml;
|
||||
}
|
||||
|
||||
public String toXml() throws Exception {
|
||||
DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
|
||||
Document doc = docBuilder.newDocument();
|
||||
|
||||
Element root = doc.createElement("dataset");
|
||||
doc.appendChild(root);
|
||||
|
||||
if(this.identifiers!=null){
|
||||
for(Identifier item : this.identifiers){
|
||||
item.toXml(root);
|
||||
}
|
||||
}
|
||||
if(this.creators!=null){
|
||||
Element creators = doc.createElement("creators");
|
||||
root.appendChild(creators);
|
||||
for(Creator item : this.creators){
|
||||
item.toXml(creators);
|
||||
}
|
||||
}
|
||||
if(this.titles!=null || this.alternativeTitles!=null){
|
||||
Element titles = doc.createElement("titles");
|
||||
root.appendChild(titles);
|
||||
if(this.titles!=null) {
|
||||
for (String item : this.titles) {
|
||||
Element title = doc.createElement("title");
|
||||
titles.appendChild(title);
|
||||
title.appendChild(doc.createTextNode(item));
|
||||
}
|
||||
}
|
||||
if(this.alternativeTitles!=null) {
|
||||
for (String item : this.alternativeTitles) {
|
||||
Element title = doc.createElement("title");
|
||||
titles.appendChild(title);
|
||||
title.setAttribute("titleType", "AlternativeTitle");
|
||||
title.appendChild(doc.createTextNode(item));
|
||||
}
|
||||
}
|
||||
}
|
||||
if(this.publishers!=null){
|
||||
for(String item : this.publishers){
|
||||
Element publisher = doc.createElement("publisher");
|
||||
root.appendChild(publisher);
|
||||
publisher.appendChild(doc.createTextNode(item));
|
||||
}
|
||||
}
|
||||
if(this.publicationDates!=null){
|
||||
for(LocalDate item : this.publicationDates){
|
||||
Element publicationYear = doc.createElement("publicationYear");
|
||||
root.appendChild(publicationYear);
|
||||
publicationYear.appendChild(doc.createTextNode(Integer.toString(item.getYear())));
|
||||
}
|
||||
}
|
||||
if(this.subjects!=null){
|
||||
Element subjects = doc.createElement("subjects");
|
||||
root.appendChild(subjects);
|
||||
for(String item : this.subjects){
|
||||
Element subject = doc.createElement("subject");
|
||||
subjects.appendChild(subject);
|
||||
subject.appendChild(doc.createTextNode(item));
|
||||
}
|
||||
}
|
||||
if(this.contributors!=null){
|
||||
for(Contributor item : this.contributors){
|
||||
item.toXml(root);
|
||||
}
|
||||
}
|
||||
if(this.createdDates!=null || this.updatedDates!=null){
|
||||
Element dates = doc.createElement("dates");
|
||||
root.appendChild(dates);
|
||||
|
||||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("YYYY-MM-DD");
|
||||
|
||||
if(createdDates!=null) {
|
||||
for (LocalDate item : this.createdDates) {
|
||||
Element date = doc.createElement("date");
|
||||
root.appendChild(date);
|
||||
date.setAttribute("dateType", "Created");
|
||||
date.appendChild(doc.createTextNode(item.format(formatter)));
|
||||
}
|
||||
}
|
||||
if(updatedDates!=null) {
|
||||
for (LocalDate item : this.updatedDates) {
|
||||
Element date = doc.createElement("date");
|
||||
root.appendChild(date);
|
||||
date.setAttribute("dateType", "Updated");
|
||||
date.appendChild(doc.createTextNode(item.format(formatter)));
|
||||
}
|
||||
}
|
||||
}
|
||||
if(this.languages!=null){
|
||||
for(String item : this.languages){
|
||||
Element language = doc.createElement("language");
|
||||
root.appendChild(language);
|
||||
language.appendChild(doc.createTextNode(item));
|
||||
}
|
||||
}
|
||||
if(this.resourceTypes!=null){
|
||||
for(ResourceType item : this.resourceTypes){
|
||||
item.toXml(root);
|
||||
}
|
||||
}
|
||||
if(this.alternateIdentifier!=null){
|
||||
Element alternateIdentifiers = doc.createElement("alternateIdentifiers");
|
||||
root.appendChild(alternateIdentifiers);
|
||||
for(AlternateIdentifier item : this.alternateIdentifier){
|
||||
item.toXml(alternateIdentifiers);
|
||||
}
|
||||
}
|
||||
if(this.citations!=null){
|
||||
for(Citation item : this.citations){
|
||||
item.toXml(root);
|
||||
}
|
||||
}
|
||||
if(this.sizes!=null){
|
||||
Element sizes = doc.createElement("sizes");
|
||||
root.appendChild(sizes);
|
||||
for(String item : this.sizes){
|
||||
Element size = doc.createElement("size");
|
||||
sizes.appendChild(size);
|
||||
size.appendChild(doc.createTextNode(item));
|
||||
}
|
||||
}
|
||||
if(this.format!=null){
|
||||
Element formats = doc.createElement("formats");
|
||||
root.appendChild(formats);
|
||||
for(String item : this.format){
|
||||
Element format = doc.createElement("format");
|
||||
formats.appendChild(format);
|
||||
format.appendChild(doc.createTextNode(item));
|
||||
}
|
||||
}
|
||||
if(this.version!=null){
|
||||
for(String item : this.version){
|
||||
Element version = doc.createElement("version");
|
||||
root.appendChild(version);
|
||||
version.appendChild(doc.createTextNode(item));
|
||||
}
|
||||
}
|
||||
if(this.licenses!=null){
|
||||
Element rightsList = doc.createElement("rightsList");
|
||||
root.appendChild(rightsList);
|
||||
for(License item : this.licenses){
|
||||
item.toXml(rightsList);
|
||||
}
|
||||
}
|
||||
if(this.descriptions!=null || this.disambiguatingDescriptions!=null){
|
||||
Element descriptions = doc.createElement("descriptions");
|
||||
root.appendChild(descriptions);
|
||||
if(this.descriptions!=null) {
|
||||
for (String item : this.descriptions) {
|
||||
Element description = doc.createElement("description");
|
||||
descriptions.appendChild(description);
|
||||
description.setAttribute("descriptionType", "Abstract");
|
||||
description.appendChild(doc.createTextNode(item));
|
||||
}
|
||||
}
|
||||
if(this.disambiguatingDescriptions!=null) {
|
||||
for (String item : this.disambiguatingDescriptions) {
|
||||
Element description = doc.createElement("description");
|
||||
descriptions.appendChild(description);
|
||||
description.setAttribute("descriptionType", "Other");
|
||||
description.appendChild(doc.createTextNode(item));
|
||||
}
|
||||
}
|
||||
}
|
||||
if(this.geoLocations!=null){
|
||||
Element geoLocations = doc.createElement("geoLocations");
|
||||
root.appendChild(geoLocations);
|
||||
for(SpatialCoverage item : this.geoLocations){
|
||||
item.toXml(geoLocations);
|
||||
}
|
||||
}
|
||||
|
||||
TransformerFactory tf = TransformerFactory.newInstance();
|
||||
Transformer transformer = tf.newTransformer();
|
||||
StringWriter writer = new StringWriter();
|
||||
transformer.transform(new DOMSource(doc), new StreamResult(writer));
|
||||
String xml = writer.getBuffer().toString();
|
||||
return xml;
|
||||
}
|
||||
|
||||
public static class SpatialCoverage{
|
||||
public static class Point{
|
||||
public String latitude;
|
||||
public String longitude;
|
||||
|
||||
public Point() {}
|
||||
|
||||
public Point(String latitude, String longitude){
|
||||
this.latitude = latitude;
|
||||
this.longitude = longitude;
|
||||
}
|
||||
}
|
||||
public String name;
|
||||
public List<Point> points;
|
||||
public List<String> boxes;
|
||||
|
||||
public SpatialCoverage() {}
|
||||
|
||||
public SpatialCoverage(String name, List<Point> points, List<String> boxes ) {
|
||||
this.name = name;
|
||||
this.points = points;
|
||||
this.boxes = boxes;
|
||||
}
|
||||
|
||||
public void toXml(Element parent){
|
||||
Element node = parent.getOwnerDocument().createElement("geoLocation");
|
||||
parent.appendChild(node);
|
||||
|
||||
if(this.points!=null) {
|
||||
for(Point point : this.points) {
|
||||
if(point.latitude == null || point.longitude == null) continue;
|
||||
Element geoLocationPoint = parent.getOwnerDocument().createElement("geoLocationPoint");
|
||||
geoLocationPoint.appendChild(parent.getOwnerDocument().createTextNode(String.format("%s %s", point.latitude, point.longitude)));
|
||||
node.appendChild(geoLocationPoint);
|
||||
}
|
||||
}
|
||||
if(this.boxes!=null) {
|
||||
for(String box : this.boxes) {
|
||||
if(box == null) continue;
|
||||
Element geoLocationBox = parent.getOwnerDocument().createElement("geoLocationBox");
|
||||
geoLocationBox.appendChild(parent.getOwnerDocument().createTextNode(box));
|
||||
node.appendChild(geoLocationBox);
|
||||
}
|
||||
}
|
||||
if(this.name!=null) {
|
||||
Element geoLocationPlace = parent.getOwnerDocument().createElement("geoLocationPlace");
|
||||
geoLocationPlace.appendChild(parent.getOwnerDocument().createTextNode(this.name));
|
||||
node.appendChild(geoLocationPlace);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static class License{
|
||||
public String name;
|
||||
public String url;
|
||||
|
||||
public License() {}
|
||||
|
||||
public License(String name, String url) {
|
||||
this.name = name;
|
||||
this.url = url;
|
||||
}
|
||||
|
||||
public void toXml(Element parent){
|
||||
Element node = parent.getOwnerDocument().createElement("rights");
|
||||
parent.appendChild(node);
|
||||
|
||||
if(this.url!=null) {
|
||||
node.setAttribute("rightsURI", this.url);
|
||||
}
|
||||
if(this.name!=null) {
|
||||
node.appendChild(parent.getOwnerDocument().createTextNode(this.name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static class Citation{
|
||||
public enum CitationIdentifierType{
|
||||
ARK, arXiv, bibcode, DOI, EAN13, EISSN, Handle, ISBN, ISSN, ISTC, LISSN, LSID, PMID,
|
||||
PURL, UPC, URL, URN
|
||||
}
|
||||
|
||||
public CitationIdentifierType type;
|
||||
public String value;
|
||||
|
||||
public Citation() {}
|
||||
|
||||
public Citation(String value, CitationIdentifierType type) {
|
||||
this.value = value;
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public void toXml(Element parent){
|
||||
Element node = parent.getOwnerDocument().createElement("relatedIdentifier");
|
||||
parent.appendChild(node);
|
||||
|
||||
node.setAttribute("relatedIdentifierType", this.type.toString());
|
||||
node.setAttribute("relationType", "Cites");
|
||||
node.appendChild(parent.getOwnerDocument().createTextNode(this.value));
|
||||
}
|
||||
}
|
||||
|
||||
public static class Contributor{
|
||||
public enum ContributorType{
|
||||
ContactPerson, DataCollector, DataCurator, DataManager, Distributor, Editor, Funder, HostingInstitution,
|
||||
Producer, ProjectLeader, ProjectManager, ProjectMember, RegistrationAgency, RegistrationAuthority,
|
||||
RelatedPerson, Researcher, ResearchGroup, RightsHolder, Sponsor, Supervisor, WorkPackageLeader, Other
|
||||
}
|
||||
|
||||
public String name;
|
||||
public List<String> affiliations;
|
||||
public ContributorType type;
|
||||
|
||||
public Contributor() {
|
||||
}
|
||||
|
||||
public Contributor(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public Contributor(String name, List<String> affiliations) {
|
||||
this.name = name;
|
||||
this.affiliations = affiliations;
|
||||
}
|
||||
|
||||
public Contributor(String name, List<String> affiliations, ContributorType type) {
|
||||
this.name = name;
|
||||
this.affiliations = affiliations;
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public void toXml(Element parent){
|
||||
Element node = parent.getOwnerDocument().createElement("contributor");
|
||||
parent.appendChild(node);
|
||||
|
||||
node.setAttribute("contributorType", this.type.toString());
|
||||
|
||||
if(this.name!=null) {
|
||||
Element contributorName = parent.getOwnerDocument().createElement("contributorName");
|
||||
node.appendChild(contributorName);
|
||||
contributorName.appendChild(parent.getOwnerDocument().createTextNode(this.name));
|
||||
}
|
||||
if(this.affiliations!=null) {
|
||||
for(String item : this.affiliations) {
|
||||
Element affiliation = parent.getOwnerDocument().createElement("affiliation");
|
||||
node.appendChild(affiliation);
|
||||
affiliation.appendChild(parent.getOwnerDocument().createTextNode(item));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static class AlternateIdentifier{
|
||||
public String identifier;
|
||||
public String type;
|
||||
|
||||
public AlternateIdentifier() {}
|
||||
|
||||
public AlternateIdentifier(String identifier, String type) {
|
||||
this.identifier = identifier;
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public void toXml(Element parent){
|
||||
Element node = parent.getOwnerDocument().createElement("alternateIdentifier");
|
||||
parent.appendChild(node);
|
||||
|
||||
if(this.type!=null) {
|
||||
node.setAttribute("alternateIdentifierType", this.type);
|
||||
}
|
||||
if(this.identifier!=null) {
|
||||
node.appendChild(parent.getOwnerDocument().createTextNode(this.identifier));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static class ResourceType{
|
||||
public enum ResourceTypeGeneralType {
|
||||
Audiovisual, Collection, Dataset, Event, Image, InteractiveResource, Model, PhysicalObject, Service,
|
||||
Software, Sound, Text, Workflow, Other
|
||||
}
|
||||
|
||||
public ResourceTypeGeneralType type;
|
||||
|
||||
public ResourceType() {}
|
||||
|
||||
public ResourceType(ResourceTypeGeneralType type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public void toXml(Element parent){
|
||||
Element node = parent.getOwnerDocument().createElement("resourceType");
|
||||
parent.appendChild(node);
|
||||
|
||||
if(this.type!=null) {
|
||||
node.setAttribute("resourceTypeGeneral", this.type.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static class Creator {
|
||||
public String name;
|
||||
public List<String> affiliations;
|
||||
|
||||
public Creator() {
|
||||
}
|
||||
|
||||
public Creator(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public Creator(String name, List<String> affiliations) {
|
||||
this.name = name;
|
||||
this.affiliations = affiliations;
|
||||
}
|
||||
|
||||
public void toXml(Element parent){
|
||||
Element node = parent.getOwnerDocument().createElement("creator");
|
||||
parent.appendChild(node);
|
||||
|
||||
if(this.name!=null) {
|
||||
Element creatorName = parent.getOwnerDocument().createElement("creatorName");
|
||||
node.appendChild(creatorName);
|
||||
creatorName.appendChild(parent.getOwnerDocument().createTextNode(this.name));
|
||||
}
|
||||
if(this.affiliations!=null) {
|
||||
for(String item : this.affiliations) {
|
||||
Element affiliation = parent.getOwnerDocument().createElement("affiliation");
|
||||
node.appendChild(affiliation);
|
||||
affiliation.appendChild(parent.getOwnerDocument().createTextNode(item));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static class Identifier {
|
||||
public enum IdentifierType {
|
||||
ARK, DOI, Handle, PURL, URN, URL
|
||||
}
|
||||
|
||||
public String value;
|
||||
public IdentifierType type;
|
||||
|
||||
public Identifier() {
|
||||
}
|
||||
|
||||
public Identifier(IdentifierType type, String value) {
|
||||
this.type = type;
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public void toXml(Element parent){
|
||||
Element node = parent.getOwnerDocument().createElement("identifier");
|
||||
parent.appendChild(node);
|
||||
|
||||
node.setAttribute("identifierType", this.type.toString());
|
||||
if(this.value!=null) {
|
||||
node.appendChild(parent.getOwnerDocument().createTextNode(this.value));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,514 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.json.JSONObject;
|
||||
|
||||
import java.net.URL;
|
||||
import java.time.LocalDate;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.*;
|
||||
|
||||
public class DatasetMappingIterator implements Iterator<String> {
|
||||
// FIX: copy-paste bug — the logger was created for EndpointAccessIterator, so
// this class's messages were attributed to the wrong category.
private static final Log log = LogFactory.getLog(DatasetMappingIterator.class);
|
||||
|
||||
// Configuration for mapping harvested schema.org records onto DatasetDocument
// fields. Nested option holders expose public fields; all values are optional.
public static class Options {
	// How to classify raw identifier strings: each mapping* list holds the
	// source keys mapped to that identifier type; fallbackType/fallbackURL
	// control what happens when no mapping matches.
	public static class IdentifierOptions{
		public List<String> mappingARK;
		public List<String> mappingDOI;
		public List<String> mappingHandle;
		public List<String> mappingPURL;
		public List<String> mappingURN;
		public List<String> mappingURL;
		public DatasetDocument.Identifier.IdentifierType fallbackType;
		public Boolean fallbackURL;
	}

	// Contributor type to use when the source record does not specify one.
	public static class ContributorOptions{
		public DatasetDocument.Contributor.ContributorType fallbackType;
	}

	// Date-parsing pattern for the publication date field.
	public static class PublicationDateOptions{
		public String format;
	}

	// Date-parsing pattern for the created date field.
	public static class CreatedDateOptions{
		public String format;
	}

	// Date-parsing pattern for the updated date field.
	public static class UpdatedDateOptions{
		public String format;
	}

	private IdentifierOptions identifierOptions;
	private PublicationDateOptions publicationDateOptions;
	private ContributorOptions contributorOptions;
	private CreatedDateOptions createdDateOptions;
	private UpdatedDateOptions updatedDateOptions;

	public UpdatedDateOptions getUpdatedDateOptions() {
		return updatedDateOptions;
	}

	public void setUpdatedDateOptions(UpdatedDateOptions updatedDateOptions) {
		this.updatedDateOptions = updatedDateOptions;
	}

	public CreatedDateOptions getCreatedDateOptions() {
		return createdDateOptions;
	}

	public void setCreatedDateOptions(CreatedDateOptions createdDateOptions) {
		this.createdDateOptions = createdDateOptions;
	}

	public ContributorOptions getContributorOptions() {
		return contributorOptions;
	}

	public void setContributorOptions(ContributorOptions contributorOptions) {
		this.contributorOptions = contributorOptions;
	}

	public PublicationDateOptions getPublicationDateOptions() {
		return publicationDateOptions;
	}

	public void setPublicationDateOptions(PublicationDateOptions publicationDateOptions) {
		this.publicationDateOptions = publicationDateOptions;
	}

	public IdentifierOptions getIdentifierOptions() {
		return identifierOptions;
	}

	public void setIdentifierOptions(IdentifierOptions identifierOptions) {
		this.identifierOptions = identifierOptions;
	}
}
|
||||
|
||||
private Options options;
|
||||
private EndpointAccessIterator endpointAccessIterator;
|
||||
|
||||
public DatasetMappingIterator(Options options, EndpointAccessIterator endpointAccessIterator) {
|
||||
this.options = options;
|
||||
this.endpointAccessIterator = endpointAccessIterator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return this.endpointAccessIterator.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
JSONObject document = this.endpointAccessIterator.next();
|
||||
String xml = null;
|
||||
if (document == null) {
|
||||
log.debug("no document provided to process. returning empty");
|
||||
xml = DatasetDocument.emptyXml();
|
||||
}
|
||||
else {
|
||||
log.debug("building document");
|
||||
xml = this.buildDataset(document);
|
||||
if (!Utils.validateXml(xml)) {
|
||||
log.debug("xml not valid. setting to empty");
|
||||
xml = null;
|
||||
}
|
||||
if (xml == null) {
|
||||
log.debug("could not build xml. returning empty");
|
||||
xml = DatasetDocument.emptyXml();
|
||||
}
|
||||
}
|
||||
|
||||
//if all else fails
|
||||
if(xml == null){
|
||||
log.debug("could not build xml. returning empty");
|
||||
xml = "<dataset/>";
|
||||
}
|
||||
|
||||
log.debug("xml document for dataset is: "+xml);
|
||||
|
||||
return xml;
|
||||
}
|
||||
|
||||
private String buildDataset(JSONObject document){
|
||||
String xml = null;
|
||||
try{
|
||||
DatasetDocument dataset = new DatasetDocument();
|
||||
|
||||
dataset.setIdentifiers(this.extractIdentifier(document));
|
||||
dataset.setCreators(this.extractCreator(document));
|
||||
dataset.setTitles(this.extractTitles(document));
|
||||
dataset.setAlternativeTitles(this.extractAlternateTitles(document));
|
||||
dataset.setPublishers(this.extractPublisher(document));
|
||||
dataset.setPublicationDates(this.extractPublicationDate(document));
|
||||
dataset.setSubjects(this.extractSubjects(document));
|
||||
dataset.setContributors(this.extractContributors(document));
|
||||
dataset.setCreatedDates(this.extractCreatedDate(document));
|
||||
dataset.setUpdatedDates(this.extractUpdatedDate(document));
|
||||
dataset.setLanguages(this.extractLanguages(document));
|
||||
dataset.setResourceTypes(this.extractResourceTypes(document));
|
||||
dataset.setAlternateIdentifier(this.extractAlternateIdentifiers(document));
|
||||
dataset.setCitations(this.extractCitations(document));
|
||||
dataset.setSizes(this.extractSize(document));
|
||||
dataset.setFormat(this.extractEncodingFormat(document));
|
||||
dataset.setVersion(this.extractVersion(document));
|
||||
dataset.setLicenses(this.extractLicense(document));
|
||||
dataset.setDescriptions(this.extractDescription(document));
|
||||
dataset.setDisambiguatingDescriptions(this.extractDisambiguatingDescription(document));
|
||||
dataset.setGeoLocations(this.extractSpatialCoverage(document));
|
||||
|
||||
log.debug("document contains native identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
|
||||
|
||||
if((dataset.getIdentifiers() == null || dataset.getIdentifiers().size() == 0) &&
|
||||
this.options.getIdentifierOptions().fallbackURL){
|
||||
log.debug("falling back to url identifier");
|
||||
dataset.setIdentifiers(this.extractIdentifierFallbackURL(document));
|
||||
log.debug("document contains overridden identifier: : "+(dataset.getIdentifiers()!=null && dataset.getIdentifiers().size() > 0));
|
||||
}
|
||||
|
||||
xml = dataset.toXml();
|
||||
}
|
||||
catch(Exception ex){
|
||||
log.error("problem constructing dataset xml. returning empty", ex);
|
||||
xml = null;
|
||||
}
|
||||
return xml;
|
||||
}
|
||||
|
||||
private List<DatasetDocument.Identifier> extractIdentifierFallbackURL(JSONObject document){
|
||||
List<String> urls = JSONLDUtils.extractString(document, "url");
|
||||
|
||||
ArrayList<DatasetDocument.Identifier> curated = new ArrayList<>();
|
||||
for(String item : urls){
|
||||
if(item == null || item.trim().length() == 0) continue;
|
||||
curated.add(new DatasetDocument.Identifier(DatasetDocument.Identifier.IdentifierType.URL, item.trim()));
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
private List<DatasetDocument.SpatialCoverage> extractSpatialCoverage(JSONObject document){
|
||||
List<JSONLDUtils.PlaceInfo> spatials = JSONLDUtils.extractPlaces(document, "spatialCoverage");
|
||||
|
||||
ArrayList<DatasetDocument.SpatialCoverage> curated = new ArrayList<>();
|
||||
for(JSONLDUtils.PlaceInfo item : spatials){
|
||||
if((item.name == null || item.name.trim().length() == 0) &&
|
||||
(item.geoCoordinates == null || item.geoCoordinates.size() == 0) &&
|
||||
(item.geoShapes == null || item.geoShapes.size() == 0)) continue;
|
||||
|
||||
List<DatasetDocument.SpatialCoverage.Point> points = new ArrayList<>();
|
||||
List<String> boxes = new ArrayList<>();
|
||||
if(item.geoCoordinates!=null) {
|
||||
for (JSONLDUtils.GeoCoordinatesInfo iter : item.geoCoordinates){
|
||||
points.add(new DatasetDocument.SpatialCoverage.Point(iter.latitude, iter.longitude));
|
||||
}
|
||||
}
|
||||
if(item.geoShapes!=null) {
|
||||
for (JSONLDUtils.GeoShapeInfo iter : item.geoShapes){
|
||||
boxes.add(iter.box);
|
||||
}
|
||||
}
|
||||
curated.add(new DatasetDocument.SpatialCoverage(item.name, points, boxes));
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
private List<String> extractDescription(JSONObject document){
|
||||
List<String> descriptions = JSONLDUtils.extractString(document, "description");
|
||||
|
||||
ArrayList<String> curated = new ArrayList<>();
|
||||
for(String item : descriptions){
|
||||
if(item == null || item.trim().length() == 0) continue;
|
||||
curated.add(item);
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
private List<String> extractDisambiguatingDescription(JSONObject document){
|
||||
List<String> descriptions = JSONLDUtils.extractString(document, "disambiguatingDescription");
|
||||
|
||||
ArrayList<String> curated = new ArrayList<>();
|
||||
for(String item : descriptions){
|
||||
if(item == null || item.trim().length() == 0) continue;
|
||||
curated.add(item);
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
private List<DatasetDocument.License> extractLicense(JSONObject document){
|
||||
List<JSONLDUtils.LicenseInfo> licenses = JSONLDUtils.extractLicenses(document, "license");
|
||||
|
||||
ArrayList<DatasetDocument.License> curated = new ArrayList<>();
|
||||
for(JSONLDUtils.LicenseInfo item : licenses){
|
||||
if(item.url == null || item.url.trim().length() == 0) continue;
|
||||
curated.add(new DatasetDocument.License(item.name, item.url));
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
private List<String> extractVersion(JSONObject document){
|
||||
List<String> versions = JSONLDUtils.extractString(document, "version");
|
||||
|
||||
ArrayList<String> curated = new ArrayList<>();
|
||||
for(String item : versions){
|
||||
if(item == null || item.trim().length() == 0) continue;
|
||||
curated.add(item);
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
private List<String> extractSize(JSONObject document) {
|
||||
List<String> sizes = JSONLDUtils.extractSize(document, "distribution");
|
||||
|
||||
HashSet<String> curated = new HashSet<>();
|
||||
for (String item : sizes) {
|
||||
if (item == null || item.trim().length() == 0) continue;
|
||||
curated.add(item);
|
||||
}
|
||||
return new ArrayList<>(curated);
|
||||
}
|
||||
|
||||
private List<String> extractEncodingFormat(JSONObject document){
|
||||
List<String> formats = JSONLDUtils.extractEncodingFormat(document, "distribution");
|
||||
|
||||
HashSet<String> curated = new HashSet<>();
|
||||
for(String item : formats){
|
||||
if(item == null || item.trim().length() == 0) continue;
|
||||
curated.add(item);
|
||||
}
|
||||
return new ArrayList<>(curated);
|
||||
}
|
||||
|
||||
//TODO: Handle different citation types. Currently only urls
|
||||
private List<DatasetDocument.Citation> extractCitations(JSONObject document){
|
||||
List<JSONLDUtils.CitationInfo> citations = JSONLDUtils.extractCitations(document, "citation");
|
||||
|
||||
ArrayList<DatasetDocument.Citation> curated = new ArrayList<>();
|
||||
for(JSONLDUtils.CitationInfo item : citations){
|
||||
if(item.url == null || item.url.trim().length() == 0) continue;
|
||||
try{
|
||||
new URL(item.url);
|
||||
}catch (Exception ex){
|
||||
continue;
|
||||
}
|
||||
curated.add(new DatasetDocument.Citation(item.url, DatasetDocument.Citation.CitationIdentifierType.URL));
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
private List<DatasetDocument.AlternateIdentifier> extractAlternateIdentifiers(JSONObject document){
|
||||
List<String> issns = JSONLDUtils.extractString(document, "issn");
|
||||
List<String> urls = JSONLDUtils.extractString(document, "url");
|
||||
|
||||
ArrayList<DatasetDocument.AlternateIdentifier> curated = new ArrayList<>();
|
||||
for(String item : issns){
|
||||
if(item == null || item.trim().length() == 0) continue;
|
||||
curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "ISSN"));
|
||||
}
|
||||
for(String item : urls){
|
||||
if(item == null || item.trim().length() == 0) continue;
|
||||
curated.add(new DatasetDocument.AlternateIdentifier(item.trim(), "URL"));
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
private List<DatasetDocument.ResourceType> extractResourceTypes(JSONObject document){
|
||||
List<DatasetDocument.ResourceType> resourceTypes = new ArrayList<>();
|
||||
resourceTypes.add(new DatasetDocument.ResourceType(DatasetDocument.ResourceType.ResourceTypeGeneralType.Dataset));
|
||||
return resourceTypes;
|
||||
}
|
||||
|
||||
private List<String> extractLanguages(JSONObject document){
|
||||
List<String> languages = JSONLDUtils.extractLanguage(document, "inLanguage");
|
||||
|
||||
ArrayList<String> curated = new ArrayList<>();
|
||||
for(String item : languages){
|
||||
if(item == null || item.trim().length() == 0) continue;
|
||||
curated.add(item);
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
private List<LocalDate> extractUpdatedDate(JSONObject document){
|
||||
List<LocalDate> updatedDates = new ArrayList<>();
|
||||
if(this.options.getUpdatedDateOptions() == null || this.options.getUpdatedDateOptions().format == null || this.options.getUpdatedDateOptions().format.length() == 0) return updatedDates;
|
||||
|
||||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format);
|
||||
|
||||
List<String> dates = JSONLDUtils.extractString(document, "dateModified");
|
||||
for(String updatedDate : dates){
|
||||
if(updatedDate == null || updatedDate.trim().length() == 0) continue;
|
||||
try {
|
||||
LocalDate localDate = LocalDate.parse(updatedDate, formatter);
|
||||
updatedDates.add(localDate);
|
||||
} catch (Exception e) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return updatedDates;
|
||||
}
|
||||
|
||||
private List<LocalDate> extractCreatedDate(JSONObject document){
|
||||
List<LocalDate> createdDates = new ArrayList<>();
|
||||
if(this.options.getCreatedDateOptions() == null || this.options.getCreatedDateOptions().format == null || this.options.getCreatedDateOptions().format.length() == 0) return createdDates;
|
||||
|
||||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getCreatedDateOptions().format);
|
||||
|
||||
List<String> dates = JSONLDUtils.extractString(document, "dateCreated");
|
||||
for(String createdDate : dates){
|
||||
if(createdDate == null || createdDate.trim().length() == 0) continue;
|
||||
try {
|
||||
LocalDate localDate = LocalDate.parse(createdDate, formatter);
|
||||
createdDates.add(localDate);
|
||||
} catch (Exception e) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return createdDates;
|
||||
}
|
||||
|
||||
private List<DatasetDocument.Contributor> extractContributors(JSONObject document){
|
||||
List<JSONLDUtils.PrincipalInfo> editors = JSONLDUtils.extractPrincipal(document, "editor");
|
||||
List<JSONLDUtils.PrincipalInfo> funders = JSONLDUtils.extractPrincipal(document, "funder");
|
||||
List<JSONLDUtils.PrincipalInfo> producers = JSONLDUtils.extractPrincipal(document, "producer");
|
||||
List<JSONLDUtils.PrincipalInfo> sponsors = JSONLDUtils.extractPrincipal(document, "sponsor");
|
||||
List<JSONLDUtils.PrincipalInfo> constributors = JSONLDUtils.extractPrincipal(document, "contributor");
|
||||
|
||||
ArrayList<DatasetDocument.Contributor> curated = new ArrayList<>();
|
||||
for(JSONLDUtils.PrincipalInfo item : editors){
|
||||
if(item.name() == null || item.name().trim().length() == 0) continue;
|
||||
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Editor));
|
||||
}
|
||||
for(JSONLDUtils.PrincipalInfo item : funders){
|
||||
if(item.name() == null || item.name().trim().length() == 0) continue;
|
||||
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Funder));
|
||||
}
|
||||
for(JSONLDUtils.PrincipalInfo item : producers){
|
||||
if(item.name() == null || item.name().trim().length() == 0) continue;
|
||||
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Producer));
|
||||
}
|
||||
for(JSONLDUtils.PrincipalInfo item : sponsors){
|
||||
if(item.name() == null || item.name().trim().length() == 0) continue;
|
||||
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), DatasetDocument.Contributor.ContributorType.Sponsor));
|
||||
}
|
||||
for(JSONLDUtils.PrincipalInfo item : constributors){
|
||||
if(item.name() == null || item.name().trim().length() == 0) continue;
|
||||
DatasetDocument.Contributor.ContributorType type = DatasetDocument.Contributor.ContributorType.Other;
|
||||
if(this.options.getContributorOptions()!=null && this.options.getContributorOptions().fallbackType != null) type = this.options.getContributorOptions().fallbackType;
|
||||
curated.add(new DatasetDocument.Contributor(item.name(), item.affiliationNames(), type));
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
private List<String> extractSubjects(JSONObject document){
|
||||
List<String> subjects = JSONLDUtils.extractString(document, "keywords");
|
||||
|
||||
ArrayList<String> curated = new ArrayList<>();
|
||||
for(String item : subjects){
|
||||
if(item == null || item.trim().length() == 0) continue;
|
||||
curated.add(item);
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
private List<LocalDate> extractPublicationDate(JSONObject document){
|
||||
List<LocalDate> publicationDates = new ArrayList<>();
|
||||
if(this.options.getPublicationDateOptions() == null || this.options.getPublicationDateOptions().format == null || this.options.getPublicationDateOptions().format.length() == 0) return publicationDates;
|
||||
|
||||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern(this.options.getPublicationDateOptions().format);
|
||||
|
||||
List<String> dates = JSONLDUtils.extractString(document, "datePublished");
|
||||
for(String publicationDate : dates){
|
||||
if(publicationDate == null || publicationDate.trim().length() == 0) continue;
|
||||
try {
|
||||
LocalDate localDate = LocalDate.parse(publicationDate, formatter);
|
||||
publicationDates.add(localDate);
|
||||
} catch (Exception e) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return publicationDates;
|
||||
}
|
||||
|
||||
private List<String> extractPublisher(JSONObject document){
|
||||
List<JSONLDUtils.PrincipalInfo> publishers = JSONLDUtils.extractPrincipal(document, "publisher");
|
||||
|
||||
ArrayList<String> curated = new ArrayList<>();
|
||||
for(JSONLDUtils.PrincipalInfo item : publishers){
|
||||
if(item.name() == null || item.name().trim().length() == 0) continue;
|
||||
curated.add(item.name());
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
private List<String> extractTitles(JSONObject document){
|
||||
List<String> names = JSONLDUtils.extractString(document, "name");
|
||||
List<String> headlines = JSONLDUtils.extractString(document, "headline");
|
||||
|
||||
HashSet<String> titles = new HashSet<>();
|
||||
titles.addAll(names);
|
||||
titles.addAll(headlines);
|
||||
return new ArrayList<>(titles);
|
||||
}
|
||||
|
||||
private List<String> extractAlternateTitles(JSONObject document){
|
||||
List<String> names = JSONLDUtils.extractString(document, "alternateName");
|
||||
List<String> headlines = JSONLDUtils.extractString(document, "alternativeHeadline");
|
||||
|
||||
HashSet<String> titles = new HashSet<>();
|
||||
titles.addAll(names);
|
||||
titles.addAll(headlines);
|
||||
return new ArrayList<>(titles);
|
||||
}
|
||||
|
||||
private List<DatasetDocument.Identifier> extractIdentifier(JSONObject document){
|
||||
List<DatasetDocument.Identifier> curated = new ArrayList<>();
|
||||
|
||||
List<JSONLDUtils.IdentifierInfo> identifiers = JSONLDUtils.extractIdentifier(document, "identifier");
|
||||
|
||||
for(JSONLDUtils.IdentifierInfo item : identifiers){
|
||||
if(item.value == null || item.value.trim().length() == 0) continue;
|
||||
if(item.type == null || item.type.trim().length() == 0) {
|
||||
if (this.options.getIdentifierOptions().fallbackType == null) continue;
|
||||
curated.add(new DatasetDocument.Identifier(this.options.getIdentifierOptions().fallbackType, item.value.trim()));
|
||||
}
|
||||
else {
|
||||
DatasetDocument.Identifier.IdentifierType type = null;
|
||||
if(this.options.getIdentifierOptions().mappingARK != null && this.options.getIdentifierOptions().mappingARK.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.ARK;
|
||||
else if(this.options.getIdentifierOptions().mappingDOI != null && this.options.getIdentifierOptions().mappingDOI.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.DOI;
|
||||
else if(this.options.getIdentifierOptions().mappingHandle != null && this.options.getIdentifierOptions().mappingHandle.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.Handle;
|
||||
else if(this.options.getIdentifierOptions().mappingPURL != null && this.options.getIdentifierOptions().mappingPURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.PURL;
|
||||
else if(this.options.getIdentifierOptions().mappingURL != null && this.options.getIdentifierOptions().mappingURL.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URL;
|
||||
else if(this.options.getIdentifierOptions().mappingURN != null && this.options.getIdentifierOptions().mappingURN.contains(item.type.trim())) type = DatasetDocument.Identifier.IdentifierType.URN;
|
||||
|
||||
if(type == null) continue;
|
||||
curated.add(new DatasetDocument.Identifier(type, item.value.trim()));
|
||||
}
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
private List<DatasetDocument.Creator> extractCreator(JSONObject document){
|
||||
List<JSONLDUtils.PrincipalInfo> creators = JSONLDUtils.extractPrincipal(document, "creator");
|
||||
List<JSONLDUtils.PrincipalInfo> authors = JSONLDUtils.extractPrincipal(document, "author");
|
||||
|
||||
HashSet<String> foundNames = new HashSet<>();
|
||||
List<DatasetDocument.Creator> curated = new ArrayList<>();
|
||||
for(JSONLDUtils.PrincipalInfo item : creators){
|
||||
if(item.name() == null || item.name().trim().length() == 0) continue;
|
||||
if(foundNames.contains(item.name())) continue;
|
||||
foundNames.add(item.name());
|
||||
curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames()));
|
||||
}
|
||||
for(JSONLDUtils.PrincipalInfo item : authors){
|
||||
if(item.name() == null || item.name().trim().length() == 0) continue;
|
||||
if(foundNames.contains(item.name())) continue;
|
||||
foundNames.add(item.name());
|
||||
|
||||
curated.add(new DatasetDocument.Creator(item.name(), item.affiliationNames()));
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,106 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.json.JSONObject;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Iterator;
|
||||
|
||||
public class EndpointAccessIterator implements Iterator<JSONObject> {
|
||||
private static final Log log = LogFactory.getLog(EndpointAccessIterator.class);
|
||||
|
||||
public static class Options {
|
||||
|
||||
private Charset charset;
|
||||
|
||||
public Options(){}
|
||||
|
||||
public Options(Charset charset) {
|
||||
this.charset = charset;
|
||||
}
|
||||
|
||||
public Charset getCharset() {
|
||||
return charset;
|
||||
}
|
||||
|
||||
public void setCharset(Charset charset) {
|
||||
this.charset = charset;
|
||||
}
|
||||
}
|
||||
|
||||
private Options options;
|
||||
private Iterator<String> repositoryIterator;
|
||||
|
||||
public EndpointAccessIterator(Options options, Iterator<String> repositoryIterator) {
|
||||
this.options = options;
|
||||
this.repositoryIterator = repositoryIterator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return this.repositoryIterator.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public JSONObject next() {
|
||||
String endpoint = this.repositoryIterator.next();
|
||||
if(endpoint == null) return null;
|
||||
|
||||
log.debug(String.format("processing: %s", endpoint));
|
||||
|
||||
JSONObject dataset = this.extractDatasetRecord(endpoint);
|
||||
|
||||
return dataset;
|
||||
}
|
||||
|
||||
private JSONObject extractDatasetRecord(String endpoint) {
|
||||
JSONObject datasetDocument = null;
|
||||
try {
|
||||
URL urlEndpoint = new URL(endpoint);
|
||||
log.debug("downloading endpoint "+urlEndpoint);
|
||||
String payload = Utils.RemoteAccessWithRetry(3, 5000, urlEndpoint, this.options.getCharset());
|
||||
|
||||
log.trace("downloaded payload id: "+payload);
|
||||
Document doc = Jsoup.parse(payload);
|
||||
Elements scriptTags = doc.getElementsByTag("script");
|
||||
for (Element scriptTag : scriptTags) {
|
||||
if (!scriptTag.hasAttr("type")) continue;
|
||||
String scriptType = scriptTag.attr("type");
|
||||
if (!scriptType.equalsIgnoreCase("application/ld+json")) continue;
|
||||
|
||||
String data = scriptTag.data();
|
||||
JSONObject schemaItem = new JSONObject(data);
|
||||
String context = schemaItem.optString("@context");
|
||||
String type = schemaItem.optString("@type");
|
||||
|
||||
if (context == null || type == null) continue;
|
||||
|
||||
Boolean isSchemaOrgContext = context.toLowerCase().startsWith("http://schema.org") || context.toLowerCase().startsWith("https://schema.org");
|
||||
Boolean isDataset = type.equalsIgnoreCase("dataset");
|
||||
|
||||
if (!isSchemaOrgContext || !isDataset) continue;
|
||||
|
||||
log.debug(String.format("discovered dataset document: %s", schemaItem.toString()));
|
||||
|
||||
datasetDocument = schemaItem;
|
||||
break;
|
||||
}
|
||||
}catch(Exception ex){
|
||||
log.error("problem extracting dataset document. returning empty", ex);
|
||||
datasetDocument = null;
|
||||
}
|
||||
if(datasetDocument == null){
|
||||
log.debug("did not find any dataset document in endpoint");
|
||||
}
|
||||
else{
|
||||
log.debug("found dataset document in endpoint :"+datasetDocument.toString());
|
||||
}
|
||||
return datasetDocument;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,515 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg;
|
||||
|
||||
import org.json.JSONArray;
|
||||
import org.json.JSONObject;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class JSONLDUtils {
|
||||
|
||||
	// Common view over Person and Organization entries: a display name plus
	// (for persons) the names of affiliated organizations.
	public interface PrincipalInfo{
		// Display name of the principal; callers must handle null/blank values.
		String name();
		// Affiliation names, or null when the principal type carries none.
		List<String> affiliationNames();

	}
|
||||
|
||||
	// A schema.org Organization: only its display name is captured.
	public static class OrganizationInfo implements PrincipalInfo{
		public String name;

		public String name(){return this.name;}

		// Organizations have no affiliations of their own.
		public List<String> affiliationNames(){
			return null;
		}

		public OrganizationInfo(){}

		public OrganizationInfo(String name){
			this.name = name;
		}
	}
|
||||
|
||||
public static class PersonInfo implements PrincipalInfo{
|
||||
public String name;
|
||||
public List<OrganizationInfo> affiliations;
|
||||
|
||||
public String name(){return this.name;}
|
||||
|
||||
public List<String> affiliationNames(){
|
||||
if(this.affiliations == null) return null;
|
||||
List<String> curated = new ArrayList<>();
|
||||
for(OrganizationInfo item : this.affiliations){
|
||||
if(item == null || item.name == null || item.name.trim().length() == 0) continue;;
|
||||
curated.add(item.name.trim());
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
public PersonInfo(){}
|
||||
|
||||
public PersonInfo(String name){
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public PersonInfo(String name, List<OrganizationInfo> affiliations){
|
||||
this.name = name;
|
||||
this.affiliations = affiliations;
|
||||
}
|
||||
}
|
||||
|
||||
	// License attached to a dataset: a human-readable name plus the license URL.
	public static class LicenseInfo{
		public String name;
		public String url;

		public LicenseInfo(){}

		public LicenseInfo(String url){
			this.url = url;
		}

		public LicenseInfo(String url, String name){
			this.name = name;
			this.url = url;
		}
	}
|
||||
|
||||
	// Citation reference; currently only the citation URL is captured.
	public static class CitationInfo{
		public String url;

		public CitationInfo(){}

		public CitationInfo(String url){
			this.url = url;
		}
	}
|
||||
|
||||
	// Raw identifier as found in the JSON-LD: a value plus an optional type label.
	public static class IdentifierInfo{
		public String value;
		// Raw type label; may be null/blank when the source provides none.
		public String type;

		public IdentifierInfo(){}

		public IdentifierInfo(String value){
			this.value = value;
		}

		public IdentifierInfo(String value, String type){
			this.value = value;
			this.type = type;
		}
	}
|
||||
|
||||
	// Point coordinates of a schema.org GeoCoordinates node, kept as raw strings.
	public static class GeoCoordinatesInfo{
		public String latitude;
		public String longitude;

		public GeoCoordinatesInfo(){}

		public GeoCoordinatesInfo(String latitude, String longitude){
			this.latitude = latitude;
			this.longitude = longitude;
		}
	}
|
||||
|
||||
	// Bounding box of a schema.org GeoShape node, kept as the raw "box" string.
	public static class GeoShapeInfo{
		public String box;

		public GeoShapeInfo(){}

		public GeoShapeInfo(String box){
			this.box = box;
		}
	}
|
||||
|
||||
	// A schema.org Place: an optional name plus any geo coordinates/shapes found on it.
	public static class PlaceInfo{
		public String name;
		public List<GeoCoordinatesInfo> geoCoordinates;
		public List<GeoShapeInfo> geoShapes;

		public PlaceInfo(){}

		public PlaceInfo(String name, List<GeoCoordinatesInfo> geoCoordinates, List<GeoShapeInfo> geoShapes){
			this.name = name;
			this.geoCoordinates = geoCoordinates;
			this.geoShapes = geoShapes;
		}
	}
|
||||
|
||||
private static PlaceInfo extractPlaceSingle(JSONObject document){
|
||||
if(document == null || !"Place".equals(document.optString("@type"))) return null;
|
||||
String name = document.optString("name");
|
||||
List<GeoCoordinatesInfo> geoCoordinates = JSONLDUtils.extractGeoCoordinates(document, "geo");
|
||||
List<GeoShapeInfo> geoShapes = JSONLDUtils.extractGeoShapes(document, "geo");
|
||||
if((name==null || name.trim().length() == 0) &&
|
||||
(geoCoordinates == null || geoCoordinates.size() == 0) &&
|
||||
(geoShapes == null || geoShapes.size() == 0)) return null;
|
||||
return new PlaceInfo(name, geoCoordinates, geoShapes);
|
||||
}
|
||||
|
||||
public static List<PlaceInfo> extractPlaces(JSONObject document, String key) {
|
||||
List<PlaceInfo> items = new ArrayList<>();
|
||||
|
||||
JSONArray array = document.optJSONArray(key);
|
||||
JSONObject obj = document.optJSONObject(key);
|
||||
|
||||
if (array != null) {
|
||||
for (int i = 0; i < array.length(); i += 1) {
|
||||
PlaceInfo nfo = JSONLDUtils.extractPlaceSingle(array.optJSONObject(i));
|
||||
if(nfo!=null) items.add(nfo);
|
||||
}
|
||||
}else if (obj!=null) {
|
||||
PlaceInfo nfo = JSONLDUtils.extractPlaceSingle(obj);
|
||||
if(nfo!=null) items.add(nfo);
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
private static GeoCoordinatesInfo extractGeoCoordinatesSingle(JSONObject document){
|
||||
if(document == null || !"GeoCoordinates".equals(document.optString("@type"))) return null;
|
||||
String latitude = document.optString("latitude");
|
||||
String longitude = document.optString("longitude");
|
||||
if(latitude==null || latitude.trim().length()==0 || longitude==null || longitude.trim().length()==0) return null;
|
||||
return new GeoCoordinatesInfo(latitude, longitude);
|
||||
}
|
||||
|
||||
private static List<GeoCoordinatesInfo> extractGeoCoordinates(JSONObject document, String key) {
|
||||
List<GeoCoordinatesInfo> items = new ArrayList<>();
|
||||
|
||||
JSONArray array = document.optJSONArray(key);
|
||||
JSONObject obj = document.optJSONObject(key);
|
||||
|
||||
if (array != null) {
|
||||
for (int i = 0; i < array.length(); i += 1) {
|
||||
GeoCoordinatesInfo nfo = JSONLDUtils.extractGeoCoordinatesSingle(array.optJSONObject(i));
|
||||
if(nfo!=null) items.add(nfo);
|
||||
}
|
||||
}else if (obj!=null) {
|
||||
GeoCoordinatesInfo nfo = JSONLDUtils.extractGeoCoordinatesSingle(obj);
|
||||
if(nfo!=null) items.add(nfo);
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
private static GeoShapeInfo extractGeoShapeSingle(JSONObject document){
|
||||
if(document == null || !"GeoShape".equals(document.optString("@type"))) return null;
|
||||
String box = document.optString("box");
|
||||
if(box==null || box.trim().length()==0 ) return null;
|
||||
return new GeoShapeInfo(box);
|
||||
}
|
||||
|
||||
private static List<GeoShapeInfo> extractGeoShapes(JSONObject document, String key) {
|
||||
List<GeoShapeInfo> items = new ArrayList<>();
|
||||
|
||||
JSONArray array = document.optJSONArray(key);
|
||||
JSONObject obj = document.optJSONObject(key);
|
||||
|
||||
if (array != null) {
|
||||
for (int i = 0; i < array.length(); i += 1) {
|
||||
GeoShapeInfo nfo = JSONLDUtils.extractGeoShapeSingle(array.optJSONObject(i));
|
||||
if(nfo!=null) items.add(nfo);
|
||||
}
|
||||
}else if (obj!=null) {
|
||||
GeoShapeInfo nfo = JSONLDUtils.extractGeoShapeSingle(obj);
|
||||
if(nfo!=null) items.add(nfo);
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
private static OrganizationInfo extractOrganizationSingle(JSONObject document){
|
||||
if(document == null || !"Organization".equals(document.optString("@type"))) return null;
|
||||
String name = document.optString("name");
|
||||
if(name==null || name.trim().length()==0) return null;
|
||||
return new OrganizationInfo(name);
|
||||
}
|
||||
|
||||
private static List<OrganizationInfo> extractOrganization(JSONObject document, String key) {
|
||||
List<OrganizationInfo> items = new ArrayList<>();
|
||||
|
||||
JSONArray array = document.optJSONArray(key);
|
||||
JSONObject obj = document.optJSONObject(key);
|
||||
|
||||
if (array != null) {
|
||||
for (int i = 0; i < array.length(); i += 1) {
|
||||
OrganizationInfo nfo = JSONLDUtils.extractOrganizationSingle(array.optJSONObject(i));
|
||||
if(nfo!=null) items.add(nfo);
|
||||
}
|
||||
}else if (obj!=null) {
|
||||
OrganizationInfo nfo = JSONLDUtils.extractOrganizationSingle(obj);
|
||||
if(nfo!=null) items.add(nfo);
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
private static PersonInfo extractPersonSingle(JSONObject document) {
|
||||
if(document == null || !"Person".equals(document.optString("@type"))) return null;
|
||||
String name = document.optString("name");
|
||||
String givenName = document.optString("givenName");
|
||||
String familyName = document.optString("familyName");
|
||||
if ((name == null || name.trim().length() == 0) && (givenName!=null || familyName !=null)) {
|
||||
if(givenName !=null && familyName!=null) name = String.join(" ", familyName, givenName).trim();
|
||||
else if (givenName == null) name = familyName;
|
||||
else if (familyName == null) name = givenName;
|
||||
}
|
||||
if(name==null || name.trim().length()==0) return null;
|
||||
List<OrganizationInfo> affiliations = JSONLDUtils.extractOrganization(document, "affiliation");
|
||||
return new PersonInfo(name, affiliations);
|
||||
}
|
||||
|
||||
private static List<PersonInfo> extractPerson(JSONObject document, String key) {
|
||||
List<PersonInfo> items = new ArrayList<>();
|
||||
|
||||
JSONArray array = document.optJSONArray(key);
|
||||
JSONObject obj = document.optJSONObject(key);
|
||||
|
||||
if (array != null) {
|
||||
for (int i = 0; i < array.length(); i += 1) {
|
||||
PersonInfo nfo = JSONLDUtils.extractPersonSingle(array.optJSONObject(i));
|
||||
if(nfo!=null) items.add(nfo);
|
||||
}
|
||||
}else if (obj!=null) {
|
||||
PersonInfo nfo = JSONLDUtils.extractPersonSingle(obj);
|
||||
if(nfo!=null) items.add(nfo);
|
||||
} else {
|
||||
String value = document.optString(key);
|
||||
if (value != null) items.add(new PersonInfo(value));
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
public static PrincipalInfo extractPrincipalSingle(JSONObject document) {
|
||||
PrincipalInfo principal = JSONLDUtils.extractPersonSingle(document);
|
||||
if(principal == null) principal = JSONLDUtils.extractOrganizationSingle(document);
|
||||
return principal;
|
||||
}
|
||||
|
||||
public static List<PrincipalInfo> extractPrincipal(JSONObject document, String key) {
|
||||
List<PrincipalInfo> items = new ArrayList<>();
|
||||
|
||||
JSONArray array = document.optJSONArray(key);
|
||||
JSONObject obj = document.optJSONObject(key);
|
||||
|
||||
if (array != null) {
|
||||
for (int i = 0; i < array.length(); i += 1) {
|
||||
PrincipalInfo nfo = JSONLDUtils.extractPrincipalSingle(array.optJSONObject(i));
|
||||
if(nfo!=null) items.add(nfo);
|
||||
}
|
||||
}else if (obj!=null) {
|
||||
PrincipalInfo nfo = JSONLDUtils.extractPrincipalSingle(obj);
|
||||
if(nfo!=null) items.add(nfo);
|
||||
} else {
|
||||
String value = document.optString(key);
|
||||
if (value != null) items.add(new PersonInfo(value));
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
public static List<String> extractString(JSONObject document, String key){
|
||||
List<String> items = new ArrayList<>();
|
||||
|
||||
if (!document.has(key)) return items;
|
||||
|
||||
JSONArray array = document.optJSONArray(key);
|
||||
JSONObject obj = document.optJSONObject(key);
|
||||
if (array != null) {
|
||||
for (int i = 0; i < array.length(); i += 1) {
|
||||
JSONObject item = array.optJSONObject(i);
|
||||
if(item != null) continue;
|
||||
String value = array.optString(i);
|
||||
if(value == null) continue;
|
||||
items.add(value);
|
||||
}
|
||||
} else if (obj == null) {
|
||||
String value = document.optString(key);
|
||||
if(value != null) items.add(value);
|
||||
}
|
||||
|
||||
return items;
|
||||
|
||||
}
|
||||
|
||||
public static List<String> extractSize(JSONObject document, String key){
|
||||
List<String> items = new ArrayList<>();
|
||||
|
||||
JSONArray array = document.optJSONArray(key);
|
||||
JSONObject obj = document.optJSONObject(key);
|
||||
if (array != null) {
|
||||
for (int i = 0; i < array.length(); i += 1) {
|
||||
JSONObject item = array.optJSONObject(i);
|
||||
if (item == null || !"DataDownload".equals((item.optString("@type")))) continue;
|
||||
String size = item.optString("contentSize");
|
||||
if (size != null) items.add(size);
|
||||
}
|
||||
} else if (obj != null) {
|
||||
String size = obj.optString("contentSize");
|
||||
if ("DataDownload".equals((obj.optString("@type"))) && size != null) {
|
||||
items.add(size);
|
||||
}
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
public static List<String> extractEncodingFormat(JSONObject document, String key){
|
||||
List<String> items = new ArrayList<>();
|
||||
|
||||
JSONArray array = document.optJSONArray(key);
|
||||
JSONObject obj = document.optJSONObject(key);
|
||||
if (array != null) {
|
||||
for (int i = 0; i < array.length(); i += 1) {
|
||||
JSONObject item = array.optJSONObject(i);
|
||||
if (item == null || !"DataDownload".equals((item.optString("@type")))) continue;
|
||||
String encodingFormat = item.optString("encodingFormat");
|
||||
if (encodingFormat != null) items.add(encodingFormat);
|
||||
String fileFormat = item.optString("fileFormat");
|
||||
if (fileFormat != null) items.add(fileFormat);
|
||||
}
|
||||
} else if (obj != null) {
|
||||
if ("DataDownload".equals((obj.optString("@type")))) {
|
||||
String encodingFormat = obj.optString("encodingFormat");
|
||||
if (encodingFormat != null) items.add(encodingFormat);
|
||||
String fileFormat = obj.optString("fileFormat");
|
||||
if (fileFormat != null) items.add(fileFormat);
|
||||
}
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
public static List<String> extractLanguage(JSONObject document, String key){
|
||||
List<String> items = new ArrayList<>();
|
||||
|
||||
JSONArray array = document.optJSONArray(key);
|
||||
JSONObject obj = document.optJSONObject(key);
|
||||
if (array != null) {
|
||||
for (int i = 0; i < array.length(); i += 1) {
|
||||
JSONObject item = array.optJSONObject(i);
|
||||
if (item == null) {
|
||||
String value = array.optString(i);
|
||||
if (value != null) items.add(value);
|
||||
} else {
|
||||
if (!"Language".equals((item.optString("@type")))) continue;
|
||||
String name = item.optString("name");
|
||||
if (name != null) items.add(name);
|
||||
String alternateName = item.optString("alternateName");
|
||||
if (alternateName != null) items.add(alternateName);
|
||||
}
|
||||
}
|
||||
} else if (obj != null) {
|
||||
if ("Language".equals((obj.optString("@type")))){
|
||||
String name = obj.optString("name");
|
||||
if (name != null) items.add(name);
|
||||
String alternateName = obj.optString("alternateName");
|
||||
if (alternateName != null) items.add(alternateName);
|
||||
}
|
||||
} else {
|
||||
String value = document.optString(key);
|
||||
if (value != null) items.add(value);
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
public static List<LicenseInfo> extractLicenses(JSONObject document, String key){
|
||||
List<LicenseInfo> items = new ArrayList<>();
|
||||
|
||||
JSONArray array = document.optJSONArray(key);
|
||||
JSONObject obj = document.optJSONObject(key);
|
||||
if (array != null) {
|
||||
for (int i = 0; i < array.length(); i += 1) {
|
||||
JSONObject item = array.optJSONObject(i);
|
||||
if (item == null) {
|
||||
String value = array.optString(i);
|
||||
if(value != null) items.add(new LicenseInfo(value));
|
||||
} else {
|
||||
if (!"CreativeWork".equals((item.optString("@type")))) continue;
|
||||
String url = item.optString("url");
|
||||
String name = item.optString("name");
|
||||
if (url != null || name != null) items.add(new LicenseInfo(url, name));
|
||||
}
|
||||
}
|
||||
} else if (obj != null) {
|
||||
if("CreativeWork".equals((obj.optString("@type")))) {
|
||||
String url = obj.optString("url");
|
||||
String name = obj.optString("name");
|
||||
if (url != null || name != null) items.add(new LicenseInfo(url, name));
|
||||
}
|
||||
} else {
|
||||
String value = document.optString(key);
|
||||
if (value != null) items.add(new LicenseInfo(value));
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
public static List<CitationInfo> extractCitations(JSONObject document, String key){
|
||||
List<CitationInfo> items = new ArrayList<>();
|
||||
|
||||
JSONArray array = document.optJSONArray(key);
|
||||
JSONObject obj = document.optJSONObject(key);
|
||||
if (array != null) {
|
||||
for (int i = 0; i < array.length(); i += 1) {
|
||||
JSONObject item = array.optJSONObject(i);
|
||||
if (item == null) {
|
||||
String value = array.optString(i);
|
||||
if(value != null) items.add(new CitationInfo(value));
|
||||
} else {
|
||||
if (!"CreativeWork".equals((item.optString("@type")))) continue;
|
||||
String url = item.optString("url");
|
||||
if (url != null) items.add(new CitationInfo(url));
|
||||
}
|
||||
}
|
||||
} else if (obj != null) {
|
||||
if("CreativeWork".equals((obj.optString("@type")))) {
|
||||
String url = obj.optString("url");
|
||||
if (url != null) items.add(new CitationInfo(url));
|
||||
}
|
||||
} else {
|
||||
String value = document.optString(key);
|
||||
if (value != null) items.add(new CitationInfo(value));
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
private static IdentifierInfo extractIdentifierSingle(JSONObject document){
|
||||
if(document == null || !"PropertyValue".equals(document.optString("@type"))) return null;
|
||||
String name = document.optString("name");
|
||||
String value = document.optString("value");
|
||||
if(value==null || value.trim().length()==0) return null;
|
||||
return new IdentifierInfo(value, name);
|
||||
}
|
||||
|
||||
public static List<IdentifierInfo> extractIdentifier(JSONObject document, String key) {
|
||||
List<IdentifierInfo> items = new ArrayList<>();
|
||||
|
||||
JSONArray array = document.optJSONArray(key);
|
||||
JSONObject obj = document.optJSONObject(key);
|
||||
|
||||
if (array != null) {
|
||||
for (int i = 0; i < array.length(); i += 1) {
|
||||
IdentifierInfo nfo = null;
|
||||
if (array.optJSONObject(i) == null) {
|
||||
String value = array.optString(i);
|
||||
if (value != null) nfo = new IdentifierInfo(value);
|
||||
}
|
||||
if (nfo == null) nfo = JSONLDUtils.extractIdentifierSingle(array.optJSONObject(i));
|
||||
if (nfo != null) items.add(nfo);
|
||||
}
|
||||
}else if (obj!=null) {
|
||||
IdentifierInfo nfo = JSONLDUtils.extractIdentifierSingle(obj);
|
||||
if (nfo != null) items.add(nfo);
|
||||
} else {
|
||||
String value = document.optString(key);
|
||||
if (value != null) items.add(new IdentifierInfo(value));
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
 * A source of repository endpoint URLs to be harvested.
 */
public interface RepositoryIterable extends Iterable<String> {
    /**
     * Sentinel element that producers enqueue to signal that no further
     * endpoints will be emitted; consumers stop iterating when they see it.
     */
    public static String TerminationHint = "df667391-676d-4c0f-9c40-426b1001607a";
}
|
|
@ -0,0 +1,92 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class RepositoryQueueIterator implements Iterator<String> {
|
||||
private static final Log log = LogFactory.getLog(RepositoryQueueIterator.class);
|
||||
|
||||
public static class Options {
|
||||
private Boolean blockPolling;
|
||||
private long pollTimeout;
|
||||
private TimeUnit pollTimeoutUnit;
|
||||
|
||||
public Boolean getBlockPolling() {
|
||||
return blockPolling;
|
||||
}
|
||||
|
||||
public void setBlockPolling(Boolean blockPolling) {
|
||||
this.blockPolling = blockPolling;
|
||||
}
|
||||
|
||||
public long getPollTimeout() {
|
||||
return pollTimeout;
|
||||
}
|
||||
|
||||
public void setPollTimeout(long pollTimeout) {
|
||||
this.pollTimeout = pollTimeout;
|
||||
}
|
||||
|
||||
public TimeUnit getPollTimeoutUnit() {
|
||||
return pollTimeoutUnit;
|
||||
}
|
||||
|
||||
public void setPollTimeoutUnit(TimeUnit pollTimeoutUnit) {
|
||||
this.pollTimeoutUnit = pollTimeoutUnit;
|
||||
}
|
||||
}
|
||||
|
||||
private ArrayBlockingQueue<String> queue;
|
||||
private Options options;
|
||||
private boolean hasTerminated;
|
||||
|
||||
public RepositoryQueueIterator(Options options, ArrayBlockingQueue<String> queue) {
|
||||
this.options = options;
|
||||
this.queue = queue;
|
||||
this.hasTerminated = false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if(this.hasTerminated) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
String next = this.poll();
|
||||
log.debug("next endpoint to process: " + next);
|
||||
if (next != null && next.equalsIgnoreCase(RepositoryIterable.TerminationHint)) {
|
||||
log.debug("no more endpoints to process");
|
||||
this.hasTerminated = true;
|
||||
next = null;
|
||||
}
|
||||
|
||||
return next;
|
||||
}
|
||||
|
||||
private String poll(){
|
||||
String item = null;
|
||||
log.debug("retrieving endpoint from queue");
|
||||
log.debug("queue size: " + queue.size());
|
||||
if(this.options.getBlockPolling()) {
|
||||
try {
|
||||
item = this.queue.poll(this.options.getPollTimeout(), this.options.getPollTimeoutUnit());
|
||||
} catch (InterruptedException ex) {
|
||||
log.warn(String.format("could not poll elements from queue for more than %s %s. throwing", this.options.getPollTimeout(), this.options.getPollTimeoutUnit()));
|
||||
throw new NoSuchElementException(ex.getMessage());
|
||||
}
|
||||
}
|
||||
else {
|
||||
item = this.queue.poll();
|
||||
}
|
||||
log.debug("retrieved endpoint from queue");
|
||||
log.debug("queue size: " + queue.size());
|
||||
return item;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
|
||||
public class SchemaOrgIterable implements Iterable<String> {
|
||||
private static final Log log = LogFactory.getLog(SchemaOrgIterable.class);
|
||||
|
||||
public static class Options {
|
||||
private EndpointAccessIterator.Options endpointAccessOptions;
|
||||
private DatasetMappingIterator.Options datasetMappingOptions;
|
||||
|
||||
public EndpointAccessIterator.Options getEndpointAccessOptions() {
|
||||
return endpointAccessOptions;
|
||||
}
|
||||
|
||||
public void setEndpointAccessOptions(EndpointAccessIterator.Options endpointAccessOptions) {
|
||||
this.endpointAccessOptions = endpointAccessOptions;
|
||||
}
|
||||
|
||||
public DatasetMappingIterator.Options getDatasetMappingOptions() {
|
||||
return datasetMappingOptions;
|
||||
}
|
||||
|
||||
public void setDatasetMappingOptions(DatasetMappingIterator.Options datasetMappingOptions) {
|
||||
this.datasetMappingOptions = datasetMappingOptions;
|
||||
}
|
||||
}
|
||||
|
||||
private Options options;
|
||||
private RepositoryIterable repository;
|
||||
|
||||
public SchemaOrgIterable(Options options, RepositoryIterable repository){
|
||||
this.options = options;
|
||||
this.repository = repository;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
Iterator<String> repositoryIterator = this.repository.iterator();
|
||||
EndpointAccessIterator endpointAccessIterator = new EndpointAccessIterator(options.getEndpointAccessOptions(), repositoryIterator);
|
||||
DatasetMappingIterator datasetMappingIterator = new DatasetMappingIterator(options.getDatasetMappingOptions(), endpointAccessIterator);
|
||||
|
||||
return datasetMappingIterator;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg;
|
||||
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.log4j.ConsoleAppender;
|
||||
import org.apache.log4j.Level;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.log4j.PatternLayout;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class SchemaOrgMainKaggle {
|
||||
|
||||
private static final Log log = LogFactory.getLog(SchemaOrgMainKaggle.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
ConsoleAppender console = new ConsoleAppender();
|
||||
console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
|
||||
console.setThreshold(Level.DEBUG);
|
||||
console.activateOptions();
|
||||
Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);
|
||||
|
||||
HashMap<String,String> params = new HashMap<>();
|
||||
params.put("consumerBlockPolling", Boolean.toString(true));
|
||||
params.put("consumerBlockPollingTimeout", "2");
|
||||
params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
|
||||
params.put("endpointCharset", StandardCharsets.UTF_8.name());
|
||||
params.put("updatedDateFormat", "YYYY-MM-DD");
|
||||
params.put("createdDateFormat", "YYYY-MM-DD");
|
||||
params.put("publicationDateFormat", "YYYY-MM-DD");
|
||||
params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
|
||||
params.put("identifierFallbackType", DatasetDocument.Identifier.IdentifierType.Handle.toString());
|
||||
params.put("identifierFallbackURL", Boolean.toString(true));
|
||||
params.put("identifierMappingARK", "ark, ARK");
|
||||
params.put("identifierMappingDOI", "doi, DOI");
|
||||
params.put("identifierMappingHandle", "Handle, HANDLE");
|
||||
params.put("identifierMappingPURL", "purl, PURL");
|
||||
params.put("identifierMappingURN", "urn, URN");
|
||||
params.put("identifierMappingURL", "url, URL");
|
||||
|
||||
params.put("repositoryAccessType", "httpapi-kaggle");
|
||||
|
||||
params.put("httpapi-kaggle_queueSize", "100");
|
||||
params.put("httpapi-kaggle_APICharset", StandardCharsets.UTF_8.name());
|
||||
params.put("httpapi-kaggle_queryUrl", "https://www.kaggle.com/datasets_v2.json?sortBy=updated&group=public&page={PAGE}&pageSize=20&size=sizeAll&filetype=fileTypeAll&license=licenseAll");
|
||||
params.put("httpapi-kaggle_queryPagePlaceholder", "{PAGE}");
|
||||
params.put("httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems");
|
||||
params.put("httpapi-kaggle_responsePropertyDatasetList", "datasetListItems");
|
||||
params.put("httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl");
|
||||
params.put("httpapi-kaggle_responseBaseDatasetUrl", "https://www.kaggle.com");
|
||||
params.put("httpapi-kaggle_producerBlockPollingTimeout", "2");
|
||||
params.put("httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
|
||||
|
||||
InterfaceDescriptor descriptor = new InterfaceDescriptor();
|
||||
descriptor.setId("schema.org - kaggle");
|
||||
descriptor.setBaseUrl("https://www.kaggle.com");
|
||||
|
||||
descriptor.setParams(params);
|
||||
|
||||
SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
|
||||
|
||||
Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
|
||||
|
||||
String outDir = params.get("repositoryAccessType");
|
||||
|
||||
log.info("saving content in " + outDir);
|
||||
|
||||
File directory = new File(outDir);
|
||||
if (directory.exists()) {
|
||||
log.info(directory.getAbsolutePath() + " exists, cleaning up");
|
||||
FileUtils.deleteDirectory(directory);
|
||||
}
|
||||
FileUtils.forceMkdir(directory);
|
||||
Utils.writeFiles(iterable, outDir);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,80 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.log4j.ConsoleAppender;
|
||||
import org.apache.log4j.Level;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.log4j.PatternLayout;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class SchemaOrgMainReactome {
|
||||
|
||||
private static final Log log = LogFactory.getLog(SchemaOrgMainReactome.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
ConsoleAppender console = new ConsoleAppender();
|
||||
console.setLayout(new PatternLayout("%d [%p|%c|%C{1}] %m%n"));
|
||||
console.setThreshold(Level.DEBUG);
|
||||
console.activateOptions();
|
||||
Logger.getLogger("eu.dnetlib.data.collector.plugins").addAppender(console);
|
||||
|
||||
HashMap<String,String> params = new HashMap<>();
|
||||
params.put("consumerBlockPolling", Boolean.toString(true));
|
||||
params.put("consumerBlockPollingTimeout", "2");
|
||||
params.put("consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
|
||||
params.put("endpointCharset", StandardCharsets.UTF_8.name());
|
||||
params.put("updatedDateFormat", "YYYY-MM-DD");
|
||||
params.put("createdDateFormat", "YYYY-MM-DD");
|
||||
params.put("publicationDateFormat", "YYYY-MM-DD");
|
||||
params.put("contributorFallbackType", DatasetDocument.Contributor.ContributorType.Other.toString());
|
||||
params.put("identifierFallbackType", DatasetDocument.Identifier.IdentifierType.Handle.toString());
|
||||
params.put("identifierFallbackURL", Boolean.toString(true));
|
||||
params.put("identifierMappingARK", "ark, ARK");
|
||||
params.put("identifierMappingDOI", "doi, DOI");
|
||||
params.put("identifierMappingHandle", "Handle, HANDLE");
|
||||
params.put("identifierMappingPURL", "purl, PURL");
|
||||
params.put("identifierMappingURN", "urn, URN");
|
||||
params.put("identifierMappingURL", "url, URL");
|
||||
|
||||
params.put("repositoryAccessType", "sitemapindex");
|
||||
params.put("sitemap_queueSize", "100");
|
||||
params.put("sitemap_IndexCharset", StandardCharsets.UTF_8.name());
|
||||
params.put("sitemap_FileCharset", StandardCharsets.UTF_8.name());
|
||||
params.put("sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Text.toString());
|
||||
params.put("sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.GZ.toString());
|
||||
params.put("sitemap_producerBlockPollingTimeout", "2");
|
||||
params.put("sitemap_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES.toString());
|
||||
|
||||
InterfaceDescriptor descriptor = new InterfaceDescriptor();
|
||||
descriptor.setId("schema.org - reactome");
|
||||
descriptor.setBaseUrl("https://reactome.org/sitemapindex.xml");
|
||||
|
||||
descriptor.setParams(params);
|
||||
|
||||
SchemaOrgPlugin schemaOrgPlugin = new SchemaOrgPlugin();
|
||||
|
||||
Iterable<String> iterable = schemaOrgPlugin.collect(descriptor, null, null);
|
||||
|
||||
String outDir = params.get("repositoryAccessType");
|
||||
|
||||
log.info("saving content in " + outDir);
|
||||
|
||||
File directory = new File(outDir);
|
||||
if (directory.exists()) {
|
||||
log.info(directory.getAbsolutePath() + " exists, cleaning up");
|
||||
FileUtils.deleteDirectory(directory);
|
||||
}
|
||||
FileUtils.forceMkdir(directory);
|
||||
Utils.writeFiles(iterable, outDir);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,153 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg;
|
||||
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle.KaggleRepositoryIterable;
|
||||
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapFileIterator;
|
||||
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexIterator;
|
||||
import eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex.SitemapIndexRepositoryIterable;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class SchemaOrgPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
private static final Log log = LogFactory.getLog(SchemaOrgPlugin.class);
|
||||
|
||||
public String hello(){
|
||||
return "hello";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String untilDate) throws CollectorServiceException {
|
||||
try {
|
||||
RepositoryIterable repository = null;
|
||||
String repositoryAccessType = Utils.getAsString(interfaceDescriptor.getParams(), "repositoryAccessType", null);
|
||||
switch(repositoryAccessType) {
|
||||
case "sitemapindex": {
|
||||
SitemapIndexRepositoryIterable.Options repositoryOptions = this.compileSitemapIndexRepositoryOptions(interfaceDescriptor);
|
||||
SitemapIndexRepositoryIterable repositoryIterable = new SitemapIndexRepositoryIterable(repositoryOptions);
|
||||
repositoryIterable.bootstrap();
|
||||
repository = repositoryIterable;
|
||||
break;
|
||||
}
|
||||
case "httpapi-kaggle": {
|
||||
KaggleRepositoryIterable.Options repositoryOptions = this.compileKaggleRepositoryOptions(interfaceDescriptor);
|
||||
KaggleRepositoryIterable repositoryIterable = new KaggleRepositoryIterable(repositoryOptions);
|
||||
repositoryIterable.bootstrap();
|
||||
repository = repositoryIterable;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
throw new CollectorServiceException(String.format("unrecognized repository access type ", repositoryAccessType));
|
||||
}
|
||||
SchemaOrgIterable.Options schemaOrgOptions = this.compileSchemaOrgOptions(interfaceDescriptor);
|
||||
SchemaOrgIterable iterable = new SchemaOrgIterable(schemaOrgOptions, repository);
|
||||
return iterable;
|
||||
} catch (Exception e) {
|
||||
throw new CollectorServiceException("Could not create iterator", e);
|
||||
}
|
||||
}
|
||||
|
||||
private KaggleRepositoryIterable.Options compileKaggleRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
|
||||
KaggleRepositoryIterable.Options kaggleRepositoryOptions = new KaggleRepositoryIterable.Options();
|
||||
kaggleRepositoryOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "httpapi-kaggle_queueSize", 100));
|
||||
kaggleRepositoryOptions.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeout", 20));
|
||||
kaggleRepositoryOptions.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "httpapi-kaggle_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
|
||||
kaggleRepositoryOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "httpapi-kaggle_APICharset", StandardCharsets.UTF_8));
|
||||
kaggleRepositoryOptions.setQueryUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryUrl", null));
|
||||
kaggleRepositoryOptions.setQueryPagePlaceholder(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_queryPagePlaceholder", "{PAGE}"));
|
||||
kaggleRepositoryOptions.setResponsePropertyTotalDataset(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyTotalDataset", "totalDatasetListItems"));
|
||||
kaggleRepositoryOptions.setResponsePropertyDatasetList(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetList", "datasetListItems"));
|
||||
kaggleRepositoryOptions.setResponsePropertyDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responsePropertyDatasetUrl", "datasetUrl"));
|
||||
kaggleRepositoryOptions.setResponseBaseDatasetUrl(Utils.getAsString(interfaceDescriptor.getParams(), "httpapi-kaggle_responseBaseDatasetUrl", interfaceDescriptor.getBaseUrl()));
|
||||
kaggleRepositoryOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor));
|
||||
return kaggleRepositoryOptions;
|
||||
|
||||
}
|
||||
|
||||
private SitemapIndexIterator.Options compileSitemapIndexOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
|
||||
SitemapIndexIterator.Options sitemapIndexIteratorOptions = new SitemapIndexIterator.Options();
|
||||
sitemapIndexIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_IndexCharset", StandardCharsets.UTF_8));
|
||||
sitemapIndexIteratorOptions.setIndexUrl(new URL(interfaceDescriptor.getBaseUrl()));
|
||||
return sitemapIndexIteratorOptions;
|
||||
|
||||
}
|
||||
|
||||
private SitemapFileIterator.Options compileSitemapFileOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
|
||||
SitemapFileIterator.Options sitemapFileIteratorOptions = new SitemapFileIterator.Options();
|
||||
sitemapFileIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "sitemap_FileCharset", StandardCharsets.UTF_8));
|
||||
sitemapFileIteratorOptions.setSchemaType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileSchema", SitemapFileIterator.Options.SitemapSchemaType.Xml, SitemapFileIterator.Options.SitemapSchemaType.class));
|
||||
sitemapFileIteratorOptions.setFileType(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_FileType", SitemapFileIterator.Options.SitemapFileType.Text, SitemapFileIterator.Options.SitemapFileType.class));
|
||||
return sitemapFileIteratorOptions;
|
||||
}
|
||||
|
||||
private RepositoryQueueIterator.Options compileRepositoryQueueOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
|
||||
RepositoryQueueIterator.Options repositoryQueueIteratorOptions = new RepositoryQueueIterator.Options();
|
||||
repositoryQueueIteratorOptions.setBlockPolling(Utils.getAsBoolean(interfaceDescriptor.getParams(), "consumerBlockPolling", true));
|
||||
repositoryQueueIteratorOptions.setPollTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "consumerBlockPollingTimeout", 2));
|
||||
repositoryQueueIteratorOptions.setPollTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "consumerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
|
||||
return repositoryQueueIteratorOptions;
|
||||
}
|
||||
|
||||
private SitemapIndexRepositoryIterable.Options compileSitemapIndexRepositoryOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
|
||||
SitemapIndexRepositoryIterable.Options sitemapIndexRepositoryIterableOptions = new SitemapIndexRepositoryIterable.Options();
|
||||
sitemapIndexRepositoryIterableOptions.setQueueSize(Utils.getAsInt(interfaceDescriptor.getParams(), "sitemap_queueSize", 100));
|
||||
sitemapIndexRepositoryIterableOptions.setPutTimeout(Utils.getAsLong(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeout", 20));
|
||||
sitemapIndexRepositoryIterableOptions.setPutTimeoutUnit(Utils.getAsEnum(interfaceDescriptor.getParams(), "sitemap_producerBlockPollingTimeoutUnit", TimeUnit.MINUTES, TimeUnit.class));
|
||||
sitemapIndexRepositoryIterableOptions.setRepositoryQueueIteratorOptions(this.compileRepositoryQueueOptions(interfaceDescriptor));
|
||||
sitemapIndexRepositoryIterableOptions.setSitemapFileIteratorOptions(this.compileSitemapFileOptions(interfaceDescriptor));
|
||||
sitemapIndexRepositoryIterableOptions.setSitemapIndexIteratorOptions(this.compileSitemapIndexOptions(interfaceDescriptor));
|
||||
return sitemapIndexRepositoryIterableOptions;
|
||||
}
|
||||
|
||||
private EndpointAccessIterator.Options compileEndpointAccessOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
|
||||
EndpointAccessIterator.Options endpointAccessIteratorOptions = new EndpointAccessIterator.Options();
|
||||
endpointAccessIteratorOptions.setCharset(Utils.getAsCharset(interfaceDescriptor.getParams(), "endpointCharset", StandardCharsets.UTF_8));
|
||||
return endpointAccessIteratorOptions;
|
||||
}
|
||||
|
||||
private DatasetMappingIterator.Options compileDatasetMappingOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
|
||||
DatasetMappingIterator.Options datasetMappingIteratorOptions = new DatasetMappingIterator.Options();
|
||||
|
||||
DatasetMappingIterator.Options.UpdatedDateOptions datasetMappingIteratorUpdatedDateOptions = new DatasetMappingIterator.Options.UpdatedDateOptions();
|
||||
datasetMappingIteratorUpdatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "updatedDateFormat", "YYYY-MM-DD");
|
||||
datasetMappingIteratorOptions.setUpdatedDateOptions(datasetMappingIteratorUpdatedDateOptions);
|
||||
|
||||
DatasetMappingIterator.Options.CreatedDateOptions datasetMappingIteratorCreatedDateOptions = new DatasetMappingIterator.Options.CreatedDateOptions();
|
||||
datasetMappingIteratorCreatedDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "createdDateFormat", "YYYY-MM-DD");
|
||||
datasetMappingIteratorOptions.setCreatedDateOptions(datasetMappingIteratorCreatedDateOptions);
|
||||
|
||||
DatasetMappingIterator.Options.PublicationDateOptions datasetMappingIteratorPublicationDateOptions = new DatasetMappingIterator.Options.PublicationDateOptions();
|
||||
datasetMappingIteratorPublicationDateOptions.format =Utils.getAsString(interfaceDescriptor.getParams(), "publicationDateFormat", "YYYY-MM-DD");
|
||||
datasetMappingIteratorOptions.setPublicationDateOptions(datasetMappingIteratorPublicationDateOptions);
|
||||
|
||||
DatasetMappingIterator.Options.ContributorOptions datasetMappingIteratorContributorOptions = new DatasetMappingIterator.Options.ContributorOptions();
|
||||
datasetMappingIteratorContributorOptions.fallbackType =Utils.getAsEnum(interfaceDescriptor.getParams(), "contributorFallbackType",DatasetDocument.Contributor.ContributorType.Other, DatasetDocument.Contributor.ContributorType.class);
|
||||
datasetMappingIteratorOptions.setContributorOptions(datasetMappingIteratorContributorOptions);
|
||||
|
||||
DatasetMappingIterator.Options.IdentifierOptions datasetMappingIteratorIdentifierOptions = new DatasetMappingIterator.Options.IdentifierOptions();
|
||||
datasetMappingIteratorIdentifierOptions.fallbackType = Utils.getAsEnum(interfaceDescriptor.getParams(), "identifierFallbackType", null, DatasetDocument.Identifier.IdentifierType.class);
|
||||
datasetMappingIteratorIdentifierOptions.fallbackURL = Utils.getAsBoolean(interfaceDescriptor.getParams(), "identifierFallbackURL", true);
|
||||
datasetMappingIteratorIdentifierOptions.mappingARK = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingARK", null);
|
||||
datasetMappingIteratorIdentifierOptions.mappingDOI = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingDOI", null);
|
||||
datasetMappingIteratorIdentifierOptions.mappingHandle = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingHandle", null);
|
||||
datasetMappingIteratorIdentifierOptions.mappingPURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingPURL", null);
|
||||
datasetMappingIteratorIdentifierOptions.mappingURL = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURL", null);
|
||||
datasetMappingIteratorIdentifierOptions.mappingURN = Utils.getAsStringCsv(interfaceDescriptor.getParams(), "identifierMappingURN", null);
|
||||
datasetMappingIteratorOptions.setIdentifierOptions(datasetMappingIteratorIdentifierOptions);
|
||||
return datasetMappingIteratorOptions;
|
||||
}
|
||||
|
||||
private SchemaOrgIterable.Options compileSchemaOrgOptions(InterfaceDescriptor interfaceDescriptor) throws MalformedURLException {
|
||||
SchemaOrgIterable.Options schemaOrgIterableOptions = new SchemaOrgIterable.Options();
|
||||
schemaOrgIterableOptions.setDatasetMappingOptions(this.compileDatasetMappingOptions(interfaceDescriptor));
|
||||
schemaOrgIterableOptions.setEndpointAccessOptions(this.compileEndpointAccessOptions(interfaceDescriptor));
|
||||
return schemaOrgIterableOptions;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,208 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.NodeList;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.xpath.XPath;
|
||||
import javax.xml.xpath.XPathConstants;
|
||||
import javax.xml.xpath.XPathExpression;
|
||||
import javax.xml.xpath.XPathFactory;
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.UnsupportedCharsetException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
public class Utils {
|
||||
private static final Log log = LogFactory.getLog(Utils.class);
|
||||
|
||||
public static List<String> collectAsStrings(String xml, String xpath) throws Exception{
|
||||
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder builder = factory.newDocumentBuilder();
|
||||
Document doc = builder.parse(new InputSource(new StringReader(xml)));
|
||||
return Utils.collectAsStrings(doc, xpath);
|
||||
}
|
||||
|
||||
public static List<String> collectAsStrings(File file, String xpath) throws Exception{
|
||||
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder builder = factory.newDocumentBuilder();
|
||||
Document doc = builder.parse(file);
|
||||
return Utils.collectAsStrings(doc, xpath);
|
||||
}
|
||||
|
||||
public static List<String> collectAsStrings(Document doc, String xpath) throws Exception{
|
||||
XPathFactory xPathfactory = XPathFactory.newInstance();
|
||||
XPath path = xPathfactory.newXPath();
|
||||
XPathExpression expr = path.compile(xpath);
|
||||
NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
|
||||
|
||||
List<String> values = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < nodes.getLength(); i++)
|
||||
values.add(nodes.item(i).getNodeValue());
|
||||
|
||||
return values;
|
||||
}
|
||||
|
||||
public static void decompressGZipTo(File input, File output) throws Exception {
|
||||
try (GZIPInputStream in = new GZIPInputStream(new FileInputStream(input))){
|
||||
try (FileOutputStream out = new FileOutputStream(output)){
|
||||
byte[] buffer = new byte[1024];
|
||||
int len;
|
||||
while((len = in.read(buffer)) != -1){
|
||||
out.write(buffer, 0, len);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static String getAsString(HashMap<String,String> map, String key, String defaultValue)
|
||||
{
|
||||
String value = map.get(key);
|
||||
if(value == null) return defaultValue;
|
||||
return value;
|
||||
}
|
||||
|
||||
public static List<String> getAsStringCsv(HashMap<String,String> map, String key, List<String> defaultValue)
|
||||
{
|
||||
String value = map.get(key);
|
||||
if(value == null) return defaultValue;
|
||||
String[] splits = value.split(",");
|
||||
List<String> curated = new ArrayList<>();
|
||||
for(String item : splits){
|
||||
if(item == null || item.trim().length() == 0) continue;
|
||||
curated.add(item.trim());
|
||||
}
|
||||
return curated;
|
||||
}
|
||||
|
||||
public static int getAsInt(HashMap<String,String> map, String key, int defaultValue)
|
||||
{
|
||||
String value = map.get(key);
|
||||
if(value == null) return defaultValue;
|
||||
try {
|
||||
return Integer.parseInt(value);
|
||||
} catch (NumberFormatException e) {
|
||||
return defaultValue;
|
||||
}
|
||||
}
|
||||
|
||||
public static long getAsLong(HashMap<String,String> map, String key, long defaultValue)
|
||||
{
|
||||
String value = map.get(key);
|
||||
if(value == null) return defaultValue;
|
||||
try {
|
||||
return Long.parseLong(value);
|
||||
} catch (NumberFormatException e) {
|
||||
return defaultValue;
|
||||
}
|
||||
}
|
||||
|
||||
public static <E extends Enum<E>> E getAsEnum(HashMap<String,String> map, String key, E defaultValue, Class<E> clazz) {
|
||||
//EnumSet<E> values = EnumSet.allOf(defaultValue.getClass());
|
||||
EnumSet<E> values = EnumSet.allOf(clazz);
|
||||
String value = map.get(key);
|
||||
if (value == null) return defaultValue;
|
||||
for(E val : values){
|
||||
if(!val.name().equalsIgnoreCase(value)) continue;
|
||||
return val;
|
||||
}
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
public static Boolean getAsBoolean(HashMap<String,String> map, String key, Boolean defaultValue) {
|
||||
String value = map.get(key);
|
||||
if (value == null) return defaultValue;
|
||||
return Boolean.parseBoolean(value);
|
||||
}
|
||||
|
||||
public static Charset getAsCharset(HashMap<String,String> map, String key, Charset defaultValue)
|
||||
{
|
||||
String value = map.get(key);
|
||||
if(value == null) return defaultValue;
|
||||
try {
|
||||
return Charset.forName(value);
|
||||
} catch (UnsupportedCharsetException e) {
|
||||
return defaultValue;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static String RemoteAccessWithRetry(int retryCount, long waitBetweenRetriesMillis, URL endpoint, Charset charset) throws IOException {
|
||||
int retry =0;
|
||||
while(retry < retryCount) {
|
||||
try {
|
||||
return IOUtils.toString(endpoint, charset);
|
||||
} catch (Exception ex) {
|
||||
retry += 1;
|
||||
if (retry < retryCount) {
|
||||
log.debug("problem accessing url " + endpoint + ". will retry after " + waitBetweenRetriesMillis + " milliseconds");
|
||||
try {
|
||||
Thread.sleep(waitBetweenRetriesMillis);
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
else{
|
||||
log.debug("problem accessing url " + endpoint + ". throwing");
|
||||
throw ex;
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static Boolean validateXml(String xml){
|
||||
try {
|
||||
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder builder = factory.newDocumentBuilder();
|
||||
InputSource is = new InputSource(new StringReader(xml));
|
||||
builder.parse(is);
|
||||
return true;
|
||||
}catch(Exception ex){
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public static void writeFiles(final Iterable<String> iterable, final String outDir) throws DocumentException, IOException {
|
||||
|
||||
int skipped = 0;
|
||||
int count = 0;
|
||||
|
||||
for(String item : iterable) {
|
||||
|
||||
final org.dom4j.Document doc = new SAXReader().read(new StringReader(item));
|
||||
|
||||
if (StringUtils.isNotBlank(doc.valueOf("/*[local-name() = 'dataset']/*[local-name() = 'identifier']/text()"))) {
|
||||
log.info(item);
|
||||
String fileName = outDir + "/" + count++;
|
||||
|
||||
try(BufferedWriter w = new BufferedWriter(new FileWriter(fileName))) {
|
||||
w.write(item);
|
||||
}
|
||||
log.info("wrote " + fileName);
|
||||
} else {
|
||||
skipped++;
|
||||
}
|
||||
if (skipped % 100 == 0) {
|
||||
log.info("skipped so far " + skipped);
|
||||
}
|
||||
if (count % 100 == 0) {
|
||||
log.info("stored so far " + count);
|
||||
}
|
||||
}
|
||||
log.info(String.format("Done! skipped %s, stored %s", skipped, count));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,6 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
|
||||
|
||||
/**
 * Marker interface for repository iterables backed by a paged HTTP API
 * (as opposed to sitemap-index based sources). Adds no members beyond
 * {@link RepositoryIterable}.
 */
public interface HttpApiRepositoryIterable extends RepositoryIterable {
}
|
|
@ -0,0 +1,208 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg.httpapi.kaggle;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
|
||||
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryQueueIterator;
|
||||
import eu.dnetlib.data.collector.plugins.schemaorg.httpapi.HttpApiRepositoryIterable;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.json.JSONArray;
|
||||
import org.json.JSONObject;
|
||||
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class KaggleRepositoryIterable implements HttpApiRepositoryIterable {
|
||||
private static final Log log = LogFactory.getLog(KaggleRepositoryIterable.class);
|
||||
|
||||
public static class Options {
|
||||
private String queryUrl;
|
||||
private String queryPagePlaceholder;
|
||||
private Charset charset;
|
||||
private String responsePropertyTotalDataset;
|
||||
private String responsePropertyDatasetList;
|
||||
private String responsePropertyDatasetUrl;
|
||||
private String responseBaseDatasetUrl;
|
||||
private long putTimeout;
|
||||
private TimeUnit putTimeoutUnit;
|
||||
|
||||
private RepositoryQueueIterator.Options repositoryQueueIteratorOptions;
|
||||
|
||||
private int queueSize;
|
||||
|
||||
public long getPutTimeout() {
|
||||
return putTimeout;
|
||||
}
|
||||
|
||||
public void setPutTimeout(long putTimeout) {
|
||||
this.putTimeout = putTimeout;
|
||||
}
|
||||
|
||||
public TimeUnit getPutTimeoutUnit() {
|
||||
return putTimeoutUnit;
|
||||
}
|
||||
|
||||
public void setPutTimeoutUnit(TimeUnit putTimeoutUnit) {
|
||||
this.putTimeoutUnit = putTimeoutUnit;
|
||||
}
|
||||
|
||||
public int getQueueSize() {
|
||||
return queueSize;
|
||||
}
|
||||
|
||||
public void setQueueSize(int queueSize) {
|
||||
this.queueSize = queueSize;
|
||||
}
|
||||
|
||||
public String getResponseBaseDatasetUrl() {
|
||||
return responseBaseDatasetUrl;
|
||||
}
|
||||
|
||||
public void setResponseBaseDatasetUrl(String responseBaseDatasetUrl) {
|
||||
this.responseBaseDatasetUrl = responseBaseDatasetUrl;
|
||||
}
|
||||
|
||||
public RepositoryQueueIterator.Options getRepositoryQueueIteratorOptions() {
|
||||
return repositoryQueueIteratorOptions;
|
||||
}
|
||||
|
||||
public void setRepositoryQueueIteratorOptions(RepositoryQueueIterator.Options repositoryQueueIteratorOptions) {
|
||||
this.repositoryQueueIteratorOptions = repositoryQueueIteratorOptions;
|
||||
}
|
||||
|
||||
public String getResponsePropertyDatasetUrl() {
|
||||
return responsePropertyDatasetUrl;
|
||||
}
|
||||
|
||||
public void setResponsePropertyDatasetUrl(String responsePropertyDatasetUrl) {
|
||||
this.responsePropertyDatasetUrl = responsePropertyDatasetUrl;
|
||||
}
|
||||
|
||||
public String getResponsePropertyDatasetList() {
|
||||
return responsePropertyDatasetList;
|
||||
}
|
||||
|
||||
public void setResponsePropertyDatasetList(String responsePropertyDatasetList) {
|
||||
this.responsePropertyDatasetList = responsePropertyDatasetList;
|
||||
}
|
||||
|
||||
public String getResponsePropertyTotalDataset() {
|
||||
return responsePropertyTotalDataset;
|
||||
}
|
||||
|
||||
public void setResponsePropertyTotalDataset(String responsePropertyTotalDataset) {
|
||||
this.responsePropertyTotalDataset = responsePropertyTotalDataset;
|
||||
}
|
||||
|
||||
public Charset getCharset() {
|
||||
return charset;
|
||||
}
|
||||
|
||||
public void setCharset(Charset charset) {
|
||||
this.charset = charset;
|
||||
}
|
||||
|
||||
public String getQueryPagePlaceholder() {
|
||||
return queryPagePlaceholder;
|
||||
}
|
||||
|
||||
public void setQueryPagePlaceholder(String queryPagePlaceholder) {
|
||||
this.queryPagePlaceholder = queryPagePlaceholder;
|
||||
}
|
||||
|
||||
public String getQueryUrl() {
|
||||
return queryUrl;
|
||||
}
|
||||
|
||||
public void setQueryUrl(String queryUrl) {
|
||||
this.queryUrl = queryUrl;
|
||||
}
|
||||
}
|
||||
|
||||
private Options options;
|
||||
private ArrayBlockingQueue<String> queue;
|
||||
|
||||
public KaggleRepositoryIterable(Options options) {
|
||||
this.options = options;
|
||||
// this.currentPage = 1;
|
||||
// this.terminated = false;
|
||||
}
|
||||
|
||||
public void bootstrap() {
|
||||
this.queue = new ArrayBlockingQueue<>(this.options.getQueueSize());
|
||||
|
||||
Thread ft = new Thread(new Harvester() );
|
||||
ft.start();
|
||||
// ExecutorService executor = Executors.newSingleThreadExecutor();
|
||||
// executor.execute(new Harvester());
|
||||
// executor.shutdown();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
return new RepositoryQueueIterator(this.options.getRepositoryQueueIteratorOptions(), this.queue);
|
||||
}
|
||||
|
||||
private class Harvester implements Runnable{
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
this.execute();
|
||||
}
|
||||
private void execute() {
|
||||
try {
|
||||
int currentPage = 1;
|
||||
int totalDatasets = 0;
|
||||
int readDatasets = 0;
|
||||
while (true) {
|
||||
String query = options.getQueryUrl().replace(options.getQueryPagePlaceholder(), Integer.toString(currentPage));
|
||||
String response = IOUtils.toString(new URL(query), options.getCharset());
|
||||
currentPage += 1;
|
||||
|
||||
JSONObject pageObject = new JSONObject(response);
|
||||
totalDatasets = pageObject.optInt(options.getResponsePropertyTotalDataset());
|
||||
JSONArray datasets = pageObject.optJSONArray(options.getResponsePropertyDatasetList());
|
||||
|
||||
if (datasets == null || datasets.length() == 0) break;
|
||||
|
||||
readDatasets += datasets.length();
|
||||
|
||||
for (int i = 0; i < datasets.length(); i += 1) {
|
||||
JSONObject item = datasets.optJSONObject(i);
|
||||
String urlFragment = item.optString(options.getResponsePropertyDatasetUrl());
|
||||
if (urlFragment == null || urlFragment.trim().length() == 0) continue;
|
||||
String endpoint = String.format("%s%s", options.getResponseBaseDatasetUrl(), urlFragment);
|
||||
|
||||
log.debug("adding endpoint in queue");
|
||||
log.debug("queue size: " + queue.size());
|
||||
|
||||
try {
|
||||
queue.offer(endpoint, options.getPutTimeout(), options.getPutTimeoutUnit());
|
||||
} catch (InterruptedException ex) {
|
||||
log.warn(String.format("could not put elements from queue for more than %s %s. breaking", options.getPutTimeout(), options.getPutTimeoutUnit()));
|
||||
break;
|
||||
}
|
||||
log.debug("endpoint added in queue");
|
||||
log.debug("queue size: " + queue.size());
|
||||
}
|
||||
|
||||
if (readDatasets >= totalDatasets) break;
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
log.error("problem execution harvesting", ex);
|
||||
} finally {
|
||||
try {
|
||||
queue.offer(RepositoryIterable.TerminationHint, options.getPutTimeout(), options.getPutTimeoutUnit());
|
||||
} catch (Exception ex) {
|
||||
log.fatal("could not add termination hint. the process will not terminate gracefully", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,172 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.schemaorg.Utils;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Iterates over the endpoint locations listed in a single sitemap file.
 * {@link #bootstrap()} downloads the file (optionally GZIP-compressed),
 * extracts the locations (plain-text lines or XML urlset), and must be
 * called before {@link #hasNext()}/{@link #next()}.
 *
 * NOTE(review): next() returns null when exhausted instead of throwing
 * NoSuchElementException; callers in this plugin rely on the null.
 */
public class SitemapFileIterator implements Iterator<String> {
    private static final Log log = LogFactory.getLog(SitemapFileIterator.class);

    /** Configuration: file URL, charset, payload schema and compression type. */
    public static class Options {

        // Compression of the downloaded sitemap file.
        public enum SitemapFileType{
            Text,
            GZ
        }

        // Layout of the (decompressed) payload: one-location-per-line or XML urlset.
        public enum SitemapSchemaType{
            Text,
            Xml
        }

        public Options(){}

        public Options(URL fileUrl, Charset charset, SitemapSchemaType schemaType, SitemapFileType fileType) {
            this.fileUrl = fileUrl;
            this.charset = charset;
            this.schemaType = schemaType;
            this.fileType = fileType;
        }

        private SitemapFileType fileType;
        private SitemapSchemaType schemaType;
        private URL fileUrl;
        private Charset charset;

        public Charset getCharset() {
            return charset;
        }

        public void setCharset(Charset charset) {
            this.charset = charset;
        }

        public URL getFileUrl() {
            return fileUrl;
        }

        public void setFileUrl(URL fileUrl) {
            this.fileUrl = fileUrl;
        }

        public SitemapFileType getFileType() {
            return fileType;
        }

        public void setFileType(SitemapFileType fileType) {
            this.fileType = fileType;
        }

        public SitemapSchemaType getSchemaType() {
            return schemaType;
        }

        public void setSchemaType(SitemapSchemaType schemaType) {
            this.schemaType = schemaType;
        }

        // Manual field-copy clone; the class does not implement Cloneable, so
        // callers use this to re-target the same settings at another file URL.
        @Override
        public Object clone(){
            Options clone = new Options();
            clone.setCharset(this.getCharset());
            clone.setFileType(this.getFileType());
            clone.setFileUrl(this.getFileUrl());
            clone.setSchemaType(this.getSchemaType());
            return clone;
        }
    }

    private Options options;
    // Raw download target (temp file, deleted after bootstrap).
    private File downloadedFile;
    // Decompressed payload; aliases downloadedFile for the Text file type.
    private File contentFile;
    // Locations extracted by bootstrap(); consumed by next().
    private Queue<String> locations;

    public SitemapFileIterator(Options options){
        this.options = options;
    }

    /**
     * Downloads and processes the sitemap file, filling {@link #locations}.
     * On any failure the iterator is left empty (error logged, not rethrown).
     * Temp files are removed in the finally block once the locations are in
     * memory; for the Text file type contentFile == downloadedFile, so the
     * second delete is a harmless no-op.
     */
    public void bootstrap() {
        LinkedList<String> endpoints = null;
        try {
            log.debug(String.format("bootstrapping sitemapindex file access for sitemapindex %s", this.options.getFileUrl()));
            this.downloadedFile = File.createTempFile(UUID.randomUUID().toString(), ".tmp");
            this.downloadedFile.deleteOnExit();
            FileUtils.copyURLToFile(this.options.getFileUrl(), this.downloadedFile);
            log.debug(String.format("downloaded file: %s has size %d", this.downloadedFile.toString(), this.downloadedFile.length()));

            switch (this.options.getFileType()) {
                case Text: {
                    this.contentFile = this.downloadedFile;
                    break;
                }
                case GZ: {
                    this.contentFile = File.createTempFile(UUID.randomUUID().toString(), ".tmp");
                    this.contentFile.deleteOnExit();
                    Utils.decompressGZipTo(this.downloadedFile, this.contentFile);
                    log.debug(String.format("extracted gz file: %s has size %d", this.contentFile.toString(), this.contentFile.length()));
                    break;
                }
                default:
                    // Caught by the catch block below, yielding an empty iterator.
                    throw new CollectorServiceException("unrecognized file type " + this.options.getFileType());
            }

            List<String> content = this.collectContentLocations();

            log.debug(String.format("extracted %d sitemapindex endpoints", content.size()));
            endpoints = new LinkedList<>(content);
        }catch(Exception ex){
            log.error(String.format("error processing sitemapindex %s. returning 0 endpoints",this.options.getFileUrl()), ex);
            endpoints = new LinkedList<>();
        }finally {
            if (this.contentFile != null) {
                this.contentFile.delete();
            }
            if (this.downloadedFile != null) {
                this.downloadedFile.delete();
            }
        }
        this.locations = endpoints;
    }

    // Dispatches on the payload schema. The default branch reports
    // getFileType() although it switches on getSchemaType() — message only.
    private List<String> collectContentLocations() throws Exception{
        switch(this.options.getSchemaType()) {
            case Text:{
                return this.collectTextContentLocations();
            }
            case Xml:{
                return this.collectXmlContentLocations();
            }
            default: throw new CollectorServiceException("unrecognized file type "+this.options.getFileType());
        }
    }

    // One location per line, decoded with the configured charset.
    private List<String> collectTextContentLocations() throws Exception {
        log.debug(String.format("reading endpoint locations from text sitemapindex"));
        try (FileInputStream in = new FileInputStream(this.contentFile)) {
            return IOUtils.readLines(in, this.options.getCharset());
        }
    }

    // XML urlset schema: /urlset/url/loc text nodes.
    private List<String> collectXmlContentLocations() throws Exception {
        log.debug(String.format("reading endpoint locations from xml sitemapindex"));
        return Utils.collectAsStrings(this.contentFile,"/urlset/url/loc/text()");
    }

    @Override
    public boolean hasNext() {
        return !this.locations.isEmpty();
    }

    @Override
    public String next() {
        // Queue.poll: returns null when empty (see class note).
        return this.locations.poll();
    }
}
|
|
@ -0,0 +1,74 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.schemaorg.Utils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.*;
|
||||
|
||||
public class SitemapIndexIterator implements Iterator<String> {
|
||||
private static final Log log = LogFactory.getLog(SitemapIndexIterator.class);
|
||||
|
||||
public static class Options {
|
||||
private URL indexUrl;
|
||||
private Charset charset;
|
||||
|
||||
public Options(){}
|
||||
|
||||
public Options(URL indexUrl, Charset charset){
|
||||
this.indexUrl = indexUrl;
|
||||
this.charset = charset;
|
||||
}
|
||||
|
||||
public URL getIndexUrl() {
|
||||
return indexUrl;
|
||||
}
|
||||
|
||||
public void setIndexUrl(URL indexUrl) {
|
||||
this.indexUrl = indexUrl;
|
||||
}
|
||||
|
||||
public Charset getCharset() {
|
||||
return charset;
|
||||
}
|
||||
|
||||
public void setCharset(Charset charset) {
|
||||
this.charset = charset;
|
||||
}
|
||||
}
|
||||
|
||||
private Options options;
|
||||
private Queue<String> sitemapFiles;
|
||||
|
||||
public SitemapIndexIterator(Options options) {
|
||||
this.options = options;
|
||||
}
|
||||
|
||||
public void bootstrap() {
|
||||
List<String> files = null;
|
||||
try {
|
||||
log.debug("bootstrapping sitemapindex index access");
|
||||
String sitemapIndexPayload = Utils.RemoteAccessWithRetry(3, 5000, this.options.getIndexUrl(), this.options.getCharset());
|
||||
log.debug(String.format("sitemapindex payload is: %s", sitemapIndexPayload));
|
||||
files = Utils.collectAsStrings(sitemapIndexPayload, "/sitemapindex/sitemap/loc/text()");
|
||||
log.debug(String.format("extracted %d sitemapindex files", files.size()));
|
||||
}catch(Exception ex){
|
||||
log.error("problem bootstrapping sitemapindex index access. returning 0 files", ex);
|
||||
files = new ArrayList<>();
|
||||
}
|
||||
this.sitemapFiles = new PriorityQueue<String>(files);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return !this.sitemapFiles.isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
return this.sitemapFiles.poll();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,147 @@
|
|||
package eu.dnetlib.data.collector.plugins.schemaorg.sitemapindex;
|
||||
|
||||
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryIterable;
|
||||
import eu.dnetlib.data.collector.plugins.schemaorg.RepositoryQueueIterator;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.net.URL;
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class SitemapIndexRepositoryIterable implements RepositoryIterable {
|
||||
private static final Log log = LogFactory.getLog(SitemapIndexRepositoryIterable.class);
|
||||
|
||||
public static class Options {
|
||||
private SitemapIndexIterator.Options sitemapIndexIteratorOptions;
|
||||
private SitemapFileIterator.Options sitemapFileIteratorOptions;
|
||||
private RepositoryQueueIterator.Options repositoryQueueIteratorOptions;
|
||||
private long putTimeout;
|
||||
private TimeUnit putTimeoutUnit;
|
||||
|
||||
private int queueSize;
|
||||
|
||||
public long getPutTimeout() {
|
||||
return putTimeout;
|
||||
}
|
||||
|
||||
public void setPutTimeout(long putTimeout) {
|
||||
this.putTimeout = putTimeout;
|
||||
}
|
||||
|
||||
public TimeUnit getPutTimeoutUnit() {
|
||||
return putTimeoutUnit;
|
||||
}
|
||||
|
||||
public void setPutTimeoutUnit(TimeUnit putTimeoutUnit) {
|
||||
this.putTimeoutUnit = putTimeoutUnit;
|
||||
}
|
||||
|
||||
public int getQueueSize() {
|
||||
return queueSize;
|
||||
}
|
||||
|
||||
public void setQueueSize(int queueSize) {
|
||||
this.queueSize = queueSize;
|
||||
}
|
||||
|
||||
public RepositoryQueueIterator.Options getRepositoryQueueIteratorOptions() {
|
||||
return repositoryQueueIteratorOptions;
|
||||
}
|
||||
|
||||
public void setRepositoryQueueIteratorOptions(RepositoryQueueIterator.Options repositoryQueueIteratorOptions) {
|
||||
this.repositoryQueueIteratorOptions = repositoryQueueIteratorOptions;
|
||||
}
|
||||
|
||||
public SitemapIndexIterator.Options getSitemapIndexIteratorOptions() {
|
||||
return sitemapIndexIteratorOptions;
|
||||
}
|
||||
|
||||
public void setSitemapIndexIteratorOptions(SitemapIndexIterator.Options sitemapIndexIteratorOptions) {
|
||||
this.sitemapIndexIteratorOptions = sitemapIndexIteratorOptions;
|
||||
}
|
||||
|
||||
public SitemapFileIterator.Options getSitemapFileIteratorOptions() {
|
||||
return sitemapFileIteratorOptions;
|
||||
}
|
||||
|
||||
public void setSitemapFileIteratorOptions(SitemapFileIterator.Options sitemapFileIteratorOptions) {
|
||||
this.sitemapFileIteratorOptions = sitemapFileIteratorOptions;
|
||||
}
|
||||
}
|
||||
|
||||
private Options options;
|
||||
private ArrayBlockingQueue<String> queue;
|
||||
|
||||
// Stores the configuration; call bootstrap() before iterating.
public SitemapIndexRepositoryIterable(Options options) {
    this.options = options;
}
|
||||
|
||||
public void bootstrap() {
|
||||
this.queue = new ArrayBlockingQueue<>(this.options.getQueueSize());
|
||||
|
||||
Thread ft = new Thread(new Harvester() );
|
||||
ft.start();
|
||||
// ExecutorService executor = Executors.newSingleThreadExecutor();
|
||||
// executor.execute(new Harvester());
|
||||
// executor.shutdown();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
return new RepositoryQueueIterator(this.options.getRepositoryQueueIteratorOptions(), this.queue);
|
||||
}
|
||||
|
||||
private class Harvester implements Runnable{
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
this.execute();
|
||||
}
|
||||
|
||||
private void execute(){
|
||||
try {
|
||||
SitemapIndexIterator sitemapIndexIterator = new SitemapIndexIterator(options.getSitemapIndexIteratorOptions());
|
||||
sitemapIndexIterator.bootstrap();
|
||||
|
||||
while (sitemapIndexIterator.hasNext()) {
|
||||
String sitemapFile = sitemapIndexIterator.next();
|
||||
if(sitemapFile == null) continue;
|
||||
|
||||
SitemapFileIterator.Options sitemapFileIteratorOptions = (SitemapFileIterator.Options)options.getSitemapFileIteratorOptions().clone();
|
||||
sitemapFileIteratorOptions.setFileUrl(new URL(sitemapFile));
|
||||
SitemapFileIterator sitemapFileIterator = new SitemapFileIterator(sitemapFileIteratorOptions);
|
||||
sitemapFileIterator.bootstrap();
|
||||
|
||||
while(sitemapFileIterator.hasNext()){
|
||||
String endpoint = sitemapFileIterator.next();
|
||||
if(endpoint == null) continue;;
|
||||
|
||||
log.debug("adding endpoint in queue");
|
||||
log.debug("queue size: " + queue.size());
|
||||
try {
|
||||
queue.offer(endpoint, options.getPutTimeout(), options.getPutTimeoutUnit());
|
||||
} catch (InterruptedException ex) {
|
||||
log.warn(String.format("could not put elements from queue for more than %s %s. breaking", options.getPutTimeout(), options.getPutTimeoutUnit()));
|
||||
break;
|
||||
}
|
||||
log.debug("endpoint added in queue");
|
||||
log.debug("queue size: " + queue.size());
|
||||
}
|
||||
}
|
||||
}catch(Exception ex){
|
||||
log.error("problem execution harvesting", ex);
|
||||
}
|
||||
finally {
|
||||
try {
|
||||
queue.offer(RepositoryIterable.TerminationHint, options.getPutTimeout(), options.getPutTimeoutUnit());
|
||||
} catch (Exception ex) {
|
||||
log.fatal("could not add termination hint. the process will not terminate gracefully", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,71 @@
|
|||
package eu.dnetlib.data.collector.plugins.sftp;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.data.collector.plugin.AbstractCollectorPlugin;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceException;
|
||||
import eu.dnetlib.data.collector.rmi.InterfaceDescriptor;
|
||||
|
||||
/**
|
||||
* Created by andrea on 11/01/16.
|
||||
*/
|
||||
public class SftpCollectorPlugin extends AbstractCollectorPlugin {
|
||||
|
||||
private SftpIteratorFactory sftpIteratorFactory;
|
||||
|
||||
@Override
|
||||
public Iterable<String> collect(final InterfaceDescriptor interfaceDescriptor, final String fromDate, final String toDate)
|
||||
throws CollectorServiceException {
|
||||
final String baseUrl = interfaceDescriptor.getBaseUrl();
|
||||
final String username = interfaceDescriptor.getParams().get("username");
|
||||
final String password = interfaceDescriptor.getParams().get("password");
|
||||
final String recursive = interfaceDescriptor.getParams().get("recursive");
|
||||
final String extensions = interfaceDescriptor.getParams().get("extensions");
|
||||
|
||||
if ((baseUrl == null) || baseUrl.isEmpty()) {
|
||||
throw new CollectorServiceException("Param 'baseurl' is null or empty");
|
||||
}
|
||||
if ((username == null) || username.isEmpty()) {
|
||||
throw new CollectorServiceException("Param 'username' is null or empty");
|
||||
}
|
||||
if ((password == null) || password.isEmpty()) {
|
||||
throw new CollectorServiceException("Param 'password' is null or empty");
|
||||
}
|
||||
if ((recursive == null) || recursive.isEmpty()) {
|
||||
throw new CollectorServiceException("Param 'recursive' is null or empty");
|
||||
}
|
||||
if ((extensions == null) || extensions.isEmpty()) {
|
||||
throw new CollectorServiceException("Param 'extensions' is null or empty");
|
||||
}
|
||||
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new CollectorServiceException("Invalid date (YYYY-MM-DD): " + fromDate); }
|
||||
|
||||
// final int fromDateIntSeconds =
|
||||
|
||||
return new Iterable<String>() {
|
||||
|
||||
boolean isRecursive = "true".equals(recursive);
|
||||
|
||||
Set<String> extensionsSet = parseSet(extensions);
|
||||
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
return getSftpIteratorFactory().newIterator(baseUrl, username, password, isRecursive, extensionsSet, fromDate);
|
||||
}
|
||||
|
||||
private Set<String> parseSet(final String extensions) {
|
||||
return Sets.newHashSet(Splitter.on(",").omitEmptyStrings().trimResults().split(extensions));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public SftpIteratorFactory getSftpIteratorFactory() {
|
||||
return sftpIteratorFactory;
|
||||
}
|
||||
|
||||
public void setSftpIteratorFactory(SftpIteratorFactory sftpIteratorFactory) {
|
||||
this.sftpIteratorFactory = sftpIteratorFactory;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,206 @@
|
|||
package eu.dnetlib.data.collector.plugins.sftp;
|
||||
|
||||
import java.io.OutputStream;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.*;
|
||||
|
||||
import com.jcraft.jsch.*;
|
||||
import eu.dnetlib.data.collector.rmi.CollectorServiceRuntimeException;
|
||||
import org.apache.commons.io.output.ByteArrayOutputStream;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.joda.time.DateTime;
|
||||
import org.joda.time.format.DateTimeFormat;
|
||||
import org.joda.time.format.DateTimeFormatter;
|
||||
|
||||
/**
|
||||
* Created by andrea on 11/01/16.
|
||||
*/
|
||||
public class SftpIterator implements Iterator<String> {
|
||||
private static final Log log = LogFactory.getLog(SftpIterator.class);
|
||||
|
||||
private static final int MAX_RETRIES = 5;
|
||||
private static final int DEFAULT_TIMEOUT = 30000;
|
||||
private static final long BACKOFF_MILLIS = 10000;
|
||||
|
||||
private String baseUrl;
|
||||
private String sftpURIScheme;
|
||||
private String sftpServerAddress;
|
||||
private String remoteSftpBasePath;
|
||||
private String username;
|
||||
private String password;
|
||||
private boolean isRecursive;
|
||||
private Set<String> extensionsSet;
|
||||
private boolean incremental;
|
||||
|
||||
private Session sftpSession;
|
||||
private ChannelSftp sftpChannel;
|
||||
|
||||
private Queue<String> queue;
|
||||
|
||||
private DateTime fromDate = null;
|
||||
private DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
|
||||
|
||||
public SftpIterator(String baseUrl, String username, String password, boolean isRecursive, Set<String> extensionsSet, String fromDate) {
|
||||
this.baseUrl = baseUrl;
|
||||
this.username = username;
|
||||
this.password = password;
|
||||
this.isRecursive = isRecursive;
|
||||
this.extensionsSet = extensionsSet;
|
||||
this.incremental = StringUtils.isNotBlank(fromDate);
|
||||
if (incremental) {
|
||||
//I expect fromDate in the format 'yyyy-MM-dd'. See class eu.dnetlib.msro.workflows.nodes.collect.FindDateRangeForIncrementalHarvestingJobNode .
|
||||
this.fromDate = DateTime.parse(fromDate, simpleDateTimeFormatter);
|
||||
log.debug("fromDate string: " + fromDate + " -- parsed: " + this.fromDate.toString());
|
||||
}
|
||||
try {
|
||||
URI sftpServer = new URI(baseUrl);
|
||||
this.sftpURIScheme = sftpServer.getScheme();
|
||||
this.sftpServerAddress = sftpServer.getHost();
|
||||
this.remoteSftpBasePath = sftpServer.getPath();
|
||||
} catch (URISyntaxException e) {
|
||||
throw new CollectorServiceRuntimeException("Bad syntax in the URL " + baseUrl);
|
||||
}
|
||||
|
||||
connectToSftpServer();
|
||||
initializeQueue();
|
||||
}
|
||||
|
||||
private void connectToSftpServer() {
|
||||
JSch jsch = new JSch();
|
||||
|
||||
try {
|
||||
JSch.setConfig("StrictHostKeyChecking", "no");
|
||||
sftpSession = jsch.getSession(username, sftpServerAddress);
|
||||
sftpSession.setPassword(password);
|
||||
sftpSession.connect();
|
||||
|
||||
Channel channel = sftpSession.openChannel(sftpURIScheme);
|
||||
channel.connect();
|
||||
sftpChannel = (ChannelSftp) channel;
|
||||
String pwd = sftpChannel.pwd();
|
||||
log.debug("PWD from server: " + pwd);
|
||||
String fullPath = pwd + remoteSftpBasePath;
|
||||
sftpChannel.cd(fullPath);
|
||||
log.debug("PWD from server 2 after 'cd " + fullPath + "' : " + sftpChannel.pwd());
|
||||
log.info("Connected to SFTP server " + sftpServerAddress);
|
||||
} catch (JSchException e) {
|
||||
throw new CollectorServiceRuntimeException("Unable to connect to remote SFTP server.", e);
|
||||
} catch (SftpException e) {
|
||||
throw new CollectorServiceRuntimeException("Unable to access the base remote path on the SFTP server.", e);
|
||||
}
|
||||
}
|
||||
|
||||
private void disconnectFromSftpServer() {
|
||||
sftpChannel.exit();
|
||||
sftpSession.disconnect();
|
||||
}
|
||||
|
||||
private void initializeQueue() {
|
||||
queue = new LinkedList<String>();
|
||||
log.info(String.format("SFTP collector plugin collecting from %s with recursion = %s, incremental = %s with fromDate=%s", remoteSftpBasePath,
|
||||
isRecursive,
|
||||
incremental, fromDate));
|
||||
listDirectoryRecursive(".", "");
|
||||
}
|
||||
|
||||
private void listDirectoryRecursive(final String parentDir, final String currentDir) {
|
||||
String dirToList = parentDir;
|
||||
if (StringUtils.isNotBlank(currentDir)) {
|
||||
dirToList += "/" + currentDir;
|
||||
}
|
||||
log.debug("PARENT DIR: " + parentDir);
|
||||
log.debug("DIR TO LIST: " + dirToList);
|
||||
try {
|
||||
Vector<ChannelSftp.LsEntry> ls = sftpChannel.ls(dirToList);
|
||||
for (ChannelSftp.LsEntry entry : ls) {
|
||||
String currentFileName = entry.getFilename();
|
||||
if (currentFileName.equals(".") || currentFileName.equals("..")) {
|
||||
// skip parent directory and directory itself
|
||||
continue;
|
||||
}
|
||||
|
||||
SftpATTRS attrs = entry.getAttrs();
|
||||
if (attrs.isDir()) {
|
||||
if (isRecursive) {
|
||||
listDirectoryRecursive(dirToList, currentFileName);
|
||||
}
|
||||
} else {
|
||||
// test the file for extensions compliance and, just in case, add it to the list.
|
||||
for (String ext : extensionsSet) {
|
||||
if (currentFileName.endsWith(ext)) {
|
||||
//test if the file has been changed after the last collection date:
|
||||
if (incremental) {
|
||||
int mTime = attrs.getMTime();
|
||||
//int times are values reduced by the milliseconds, hence we multiply per 1000L
|
||||
DateTime dt = new DateTime(mTime * 1000L);
|
||||
if (dt.isAfter(fromDate)) {
|
||||
queue.add(currentFileName);
|
||||
log.debug(currentFileName + " has changed and must be re-collected");
|
||||
} else {
|
||||
if (log.isDebugEnabled()) {
|
||||
log.debug(currentFileName + " has not changed since last collection");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
//if it is not incremental, just add it to the queue
|
||||
queue.add(currentFileName);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (SftpException e) {
|
||||
throw new CollectorServiceRuntimeException("Cannot list the sftp remote directory", e);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (queue.isEmpty()) {
|
||||
disconnectFromSftpServer();
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
String nextRemotePath = queue.remove();
|
||||
int nRepeat = 0;
|
||||
String fullPathFile = nextRemotePath;
|
||||
while (nRepeat < MAX_RETRIES) {
|
||||
try {
|
||||
OutputStream baos = new ByteArrayOutputStream();
|
||||
sftpChannel.get(nextRemotePath, baos);
|
||||
if (log.isDebugEnabled()) {
|
||||
fullPathFile = sftpChannel.pwd() + "/" + nextRemotePath;
|
||||
log.debug(String.format("Collected file from SFTP: %s%s", sftpServerAddress, fullPathFile));
|
||||
}
|
||||
return baos.toString();
|
||||
} catch (SftpException e) {
|
||||
nRepeat++;
|
||||
log.warn(String.format("An error occurred [%s] for %s%s, retrying.. [retried %s time(s)]", e.getMessage(), sftpServerAddress, fullPathFile,
|
||||
nRepeat));
|
||||
// disconnectFromSftpServer();
|
||||
try {
|
||||
Thread.sleep(BACKOFF_MILLIS);
|
||||
} catch (InterruptedException e1) {
|
||||
log.error(e1);
|
||||
}
|
||||
}
|
||||
}
|
||||
throw new CollectorServiceRuntimeException(
|
||||
String.format("Impossible to retrieve FTP file %s after %s retries. Aborting FTP collection.", fullPathFile, nRepeat));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
package eu.dnetlib.data.collector.plugins.sftp;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Created by andrea on 11/01/16.
|
||||
*/
|
||||
public class SftpIteratorFactory {
|
||||
|
||||
public Iterator<String> newIterator(final String baseUrl,
|
||||
final String username,
|
||||
final String password,
|
||||
final boolean isRecursive,
|
||||
final Set<String> extensionsSet, final String fromDate) {
|
||||
return new SftpIterator(baseUrl, username, password, isRecursive, extensionsSet, fromDate);
|
||||
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue