forked from D-Net/dnet-hadoop
added module that allows to collect data into HDFS
This commit is contained in:
parent
fe949276f0
commit
c10770cd3e
|
@ -0,0 +1,64 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>dhp-collector-worker</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-client</artifactId>
|
||||
<version>${dhp.hadoop.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-core</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-annotations</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>jaxen</groupId>
|
||||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-test</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<executable>true</executable>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
|
||||
</project>
|
|
@ -0,0 +1,30 @@
|
|||
package eu.dnetlib.collector.worker;
|
||||
|
||||
public class DnetCollectorException extends Exception {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = -290723075076039757L;
|
||||
|
||||
public DnetCollectorException() {
|
||||
super();
|
||||
}
|
||||
|
||||
public DnetCollectorException(final String message, final Throwable cause, final boolean enableSuppression, final boolean writableStackTrace) {
|
||||
super(message, cause, enableSuppression, writableStackTrace);
|
||||
}
|
||||
|
||||
public DnetCollectorException(final String message, final Throwable cause) {
|
||||
super(message, cause);
|
||||
}
|
||||
|
||||
public DnetCollectorException(final String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
public DnetCollectorException(final Throwable cause) {
|
||||
super(cause);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,105 @@
|
|||
package eu.dnetlib.collector.worker;
|
||||
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.CommandLineRunner;
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.collector.worker.model.ApiDescriptor;
|
||||
import eu.dnetlib.collector.worker.plugins.CollectorPlugin;
|
||||
import eu.dnetlib.collector.worker.utils.CollectorPluginEnumerator;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@SpringBootApplication
|
||||
public class DnetCollectorWorkerApplication implements CommandLineRunner {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class);
|
||||
|
||||
@Autowired
|
||||
private CollectorPluginEnumerator collectorPluginEnumerator;
|
||||
|
||||
/**
|
||||
*
|
||||
* @param args
|
||||
*/
|
||||
public static void main(final String[] args) {
|
||||
SpringApplication.run(DnetCollectorWorkerApplication.class, args);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
@Override
|
||||
public void run(final String... args) throws Exception {
|
||||
if (args.length == 0) { return; }
|
||||
if (args.length != 2) { throw new DnetCollectorException("Invalid number of parameters, expected: hdfs_path and json_api_description"); }
|
||||
|
||||
final String hdfsPath = args[0];
|
||||
|
||||
log.info("hdfsPath ="+hdfsPath);
|
||||
|
||||
final String json = args[1];
|
||||
|
||||
log.info("json = "+json);
|
||||
final ObjectMapper jsonMapper = new ObjectMapper();
|
||||
final ApiDescriptor api = jsonMapper.readValue(json, ApiDescriptor.class);
|
||||
|
||||
final CollectorPlugin plugin = collectorPluginEnumerator.getPluginByProtocol(api.getProtocol());
|
||||
|
||||
final String hdfsuri ="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020";
|
||||
|
||||
// ====== Init HDFS File System Object
|
||||
Configuration conf = new Configuration();
|
||||
// Set FileSystem URI
|
||||
conf.set("fs.defaultFS", hdfsuri);
|
||||
// Because of Maven
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
// Set HADOOP user
|
||||
System.setProperty("HADOOP_USER_NAME", "sandro.labruzzo");
|
||||
System.setProperty("hadoop.home.dir", "/");
|
||||
//Get the filesystem - HDFS
|
||||
FileSystem fs = FileSystem.get(URI.create(hdfsuri), conf);
|
||||
|
||||
Path hdfswritepath = new Path(hdfsPath);
|
||||
|
||||
log.info("Created path "+hdfswritepath.toString());
|
||||
|
||||
try(SequenceFile.Writer writer = SequenceFile.createWriter(conf,
|
||||
SequenceFile.Writer.file(hdfswritepath), SequenceFile.Writer.keyClass(IntWritable.class),
|
||||
SequenceFile.Writer.valueClass(Text.class))) {
|
||||
|
||||
final AtomicInteger counter = new AtomicInteger(0);
|
||||
final IntWritable key = new IntWritable(counter.get());
|
||||
final Text value = new Text();
|
||||
|
||||
plugin.collect(api).forEach(content -> {
|
||||
|
||||
key.set(counter.getAndIncrement());
|
||||
value.set(content);
|
||||
try {
|
||||
writer.append(key, value);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
package eu.dnetlib.collector.worker.model;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class ApiDescriptor {
|
||||
|
||||
private String id;
|
||||
|
||||
private String baseUrl;
|
||||
|
||||
private String protocol;
|
||||
|
||||
private Map<String, String> params = new HashMap<>();
|
||||
|
||||
public String getBaseUrl() {
|
||||
return baseUrl;
|
||||
}
|
||||
|
||||
public void setBaseUrl(final String baseUrl) {
|
||||
this.baseUrl = baseUrl;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public Map<String, String> getParams() {
|
||||
return params;
|
||||
}
|
||||
|
||||
public void setParams(final HashMap<String, String> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public String getProtocol() {
|
||||
return protocol;
|
||||
}
|
||||
|
||||
public void setProtocol(final String protocol) {
|
||||
this.protocol = protocol;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
package eu.dnetlib.collector.worker.plugins;
|
||||
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import eu.dnetlib.collector.worker.DnetCollectorException;
|
||||
import eu.dnetlib.collector.worker.model.ApiDescriptor;
|
||||
|
||||
public interface CollectorPlugin {
|
||||
|
||||
Stream<String> collect(ApiDescriptor api) throws DnetCollectorException;
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
package eu.dnetlib.collector.worker.plugins.oai;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Spliterator;
|
||||
import java.util.Spliterators;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterators;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.collector.worker.DnetCollectorException;
|
||||
import eu.dnetlib.collector.worker.model.ApiDescriptor;
|
||||
import eu.dnetlib.collector.worker.plugins.CollectorPlugin;
|
||||
import eu.dnetlib.collector.worker.utils.DnetWorkerCollector;
|
||||
|
||||
@Component
|
||||
@DnetWorkerCollector("oai")
|
||||
public class OaiCollectorPlugin implements CollectorPlugin {
|
||||
|
||||
private static final String FORMAT_PARAM = "format";
|
||||
private static final String OAI_SET_PARAM = "set";
|
||||
private static final Object OAI_FROM_DATE_PARAM = "fromDate";
|
||||
private static final Object AI_UNTIL_DATE_PARAM = "untilDate";
|
||||
|
||||
@Autowired
|
||||
private OaiIteratorFactory oaiIteratorFactory;
|
||||
|
||||
@Override
|
||||
public Stream<String> collect(final ApiDescriptor api) throws DnetCollectorException {
|
||||
final String baseUrl = api.getBaseUrl();
|
||||
final String mdFormat = api.getParams().get(FORMAT_PARAM);
|
||||
final String setParam = api.getParams().get(OAI_SET_PARAM);
|
||||
final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM);
|
||||
final String untilDate = api.getParams().get(AI_UNTIL_DATE_PARAM);
|
||||
|
||||
final List<String> sets = new ArrayList<>();
|
||||
if (setParam != null) {
|
||||
sets.addAll(Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam)));
|
||||
}
|
||||
if (sets.isEmpty()) {
|
||||
// If no set is defined, ALL the sets must be harvested
|
||||
sets.add("");
|
||||
}
|
||||
|
||||
if (baseUrl == null || baseUrl.isEmpty()) { throw new DnetCollectorException("Param 'baseurl' is null or empty"); }
|
||||
|
||||
if (mdFormat == null || mdFormat.isEmpty()) { throw new DnetCollectorException("Param 'mdFormat' is null or empty"); }
|
||||
|
||||
if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); }
|
||||
|
||||
if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); }
|
||||
|
||||
final Iterator<Iterator<String>> iters = sets.stream()
|
||||
.map(set -> oaiIteratorFactory.newIterator(baseUrl, mdFormat, set, fromDate, untilDate))
|
||||
.iterator();
|
||||
|
||||
return StreamSupport.stream(Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), false);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,163 @@
|
|||
package eu.dnetlib.collector.worker.plugins.oai;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.URLEncoder;
|
||||
import java.util.Iterator;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
|
||||
import eu.dnetlib.collector.worker.DnetCollectorException;
|
||||
import eu.dnetlib.collector.worker.utils.HttpConnector;
|
||||
import eu.dnetlib.collector.worker.utils.XmlCleaner;
|
||||
|
||||
public class OaiIterator implements Iterator<String> {
|
||||
|
||||
private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM
|
||||
|
||||
private final Queue<String> queue = new PriorityBlockingQueue<>();
|
||||
private final SAXReader reader = new SAXReader();
|
||||
|
||||
private final String baseUrl;
|
||||
private final String set;
|
||||
private final String mdFormat;
|
||||
private final String fromDate;
|
||||
private final String untilDate;
|
||||
private String token;
|
||||
private boolean started;
|
||||
private final HttpConnector httpConnector;
|
||||
|
||||
public OaiIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate,
|
||||
final HttpConnector httpConnector) {
|
||||
this.baseUrl = baseUrl;
|
||||
this.mdFormat = mdFormat;
|
||||
this.set = set;
|
||||
this.fromDate = fromDate;
|
||||
this.untilDate = untilDate;
|
||||
this.started = false;
|
||||
this.httpConnector = httpConnector;
|
||||
}
|
||||
|
||||
private void verifyStarted() {
|
||||
if (!this.started) {
|
||||
this.started = true;
|
||||
try {
|
||||
this.token = firstPage();
|
||||
} catch (final DnetCollectorException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
synchronized (queue) {
|
||||
verifyStarted();
|
||||
return !queue.isEmpty();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
synchronized (queue) {
|
||||
verifyStarted();
|
||||
final String res = queue.poll();
|
||||
while (queue.isEmpty() && token != null && !token.isEmpty()) {
|
||||
try {
|
||||
token = otherPages(token);
|
||||
} catch (final DnetCollectorException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {}
|
||||
|
||||
private String firstPage() throws DnetCollectorException {
|
||||
try {
|
||||
String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8");
|
||||
if (set != null && !set.isEmpty()) {
|
||||
url += "&set=" + URLEncoder.encode(set, "UTF-8");
|
||||
}
|
||||
if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
||||
url += "&from=" + URLEncoder.encode(fromDate, "UTF-8");
|
||||
}
|
||||
if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) {
|
||||
url += "&until=" + URLEncoder.encode(untilDate, "UTF-8");
|
||||
}
|
||||
log.info("Start harvesting using url: " + url);
|
||||
|
||||
return downloadPage(url);
|
||||
} catch (final UnsupportedEncodingException e) {
|
||||
throw new DnetCollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private String extractResumptionToken(final String xml) {
|
||||
|
||||
final String s = StringUtils.substringAfter(xml, "<resumptionToken");
|
||||
if (s == null) { return null; }
|
||||
|
||||
final String result = StringUtils.substringBetween(s, ">", "</");
|
||||
if (result == null) { return null; }
|
||||
return result.trim();
|
||||
|
||||
}
|
||||
|
||||
private String otherPages(final String resumptionToken) throws DnetCollectorException {
|
||||
try {
|
||||
return downloadPage(baseUrl + "?verb=ListRecords&resumptionToken=" + URLEncoder.encode(resumptionToken, "UTF-8"));
|
||||
} catch (final UnsupportedEncodingException e) {
|
||||
throw new DnetCollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private String downloadPage(final String url) throws DnetCollectorException {
|
||||
|
||||
final String xml = httpConnector.getInputSource(url);
|
||||
Document doc;
|
||||
try {
|
||||
doc = reader.read(new StringReader(xml));
|
||||
} catch (final DocumentException e) {
|
||||
log.warn("Error parsing xml, I try to clean it: " + xml, e);
|
||||
final String cleaned = XmlCleaner.cleanAllEntities(xml);
|
||||
try {
|
||||
doc = reader.read(new StringReader(cleaned));
|
||||
} catch (final DocumentException e1) {
|
||||
final String resumptionToken = extractResumptionToken(xml);
|
||||
if (resumptionToken == null) { throw new DnetCollectorException("Error parsing cleaned document:" + cleaned, e1); }
|
||||
return resumptionToken;
|
||||
}
|
||||
}
|
||||
|
||||
final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']");
|
||||
if (errorNode != null) {
|
||||
final String code = errorNode.valueOf("@code");
|
||||
if ("noRecordsMatch".equalsIgnoreCase(code.trim())) {
|
||||
log.warn("noRecordsMatch for oai call: " + url);
|
||||
return null;
|
||||
} else {
|
||||
throw new DnetCollectorException(code + " - " + errorNode.getText());
|
||||
}
|
||||
}
|
||||
|
||||
for (final Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) {
|
||||
queue.add(((Node) o).asXML());
|
||||
}
|
||||
|
||||
return doc.valueOf("//*[local-name()='resumptionToken']");
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
package eu.dnetlib.collector.worker.plugins.oai;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import eu.dnetlib.collector.worker.utils.HttpConnector;
|
||||
|
||||
@Component
|
||||
public class OaiIteratorFactory {
|
||||
|
||||
@Autowired
|
||||
private HttpConnector httpConnector;
|
||||
|
||||
public Iterator<String> newIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate) {
|
||||
return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, httpConnector);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
package eu.dnetlib.collector.worker.utils;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import eu.dnetlib.collector.worker.plugins.CollectorPlugin;
|
||||
|
||||
@Component
|
||||
public class CollectorPluginEnumerator {
|
||||
|
||||
@Autowired
|
||||
private List<CollectorPlugin> plugins;
|
||||
|
||||
public CollectorPlugin getPluginByProtocol(final String protocol) {
|
||||
return plugins.stream()
|
||||
.filter(p -> p.getClass().isAnnotationPresent(DnetWorkerCollector.class))
|
||||
.filter(p -> p.getClass().getAnnotation(DnetWorkerCollector.class).value().equalsIgnoreCase(protocol))
|
||||
.findFirst()
|
||||
.get();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
package eu.dnetlib.collector.worker.utils;
|
||||
|
||||
import java.util.LinkedList;
|
||||
|
||||
public class CollectorPluginErrorLogList extends LinkedList<String> {
|
||||
|
||||
private static final long serialVersionUID = -6925786561303289704L;
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
String log = new String();
|
||||
int index = 0;
|
||||
for (final String errorMessage : this) {
|
||||
log += String.format("Retry #%s: %s / ", index++, errorMessage);
|
||||
}
|
||||
return log;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
package eu.dnetlib.collector.worker.utils;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
@Retention(RetentionPolicy.RUNTIME)
|
||||
@Target(ElementType.TYPE)
|
||||
public @interface DnetWorkerCollector {
|
||||
|
||||
String value();
|
||||
|
||||
}
|
|
@ -0,0 +1,226 @@
|
|||
package eu.dnetlib.collector.worker.utils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.CookieHandler;
|
||||
import java.net.CookieManager;
|
||||
import java.net.CookiePolicy;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.security.GeneralSecurityException;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.net.ssl.HttpsURLConnection;
|
||||
import javax.net.ssl.SSLContext;
|
||||
import javax.net.ssl.TrustManager;
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.math.NumberUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import eu.dnetlib.collector.worker.DnetCollectorException;
|
||||
|
||||
@Component
|
||||
public class HttpConnector {
|
||||
|
||||
private static final Log log = LogFactory.getLog(HttpConnector.class);
|
||||
|
||||
private int maxNumberOfRetry = 6;
|
||||
private int defaultDelay = 120; // seconds
|
||||
private int readTimeOut = 120; // seconds
|
||||
|
||||
private String responseType = null;
|
||||
|
||||
private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
|
||||
|
||||
public HttpConnector() {
|
||||
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the URL returns the content via HTTP GET
|
||||
*
|
||||
* @param requestUrl
|
||||
* the URL
|
||||
* @return the content of the downloaded resource
|
||||
* @throws CollectorServiceException
|
||||
* when retrying more than maxNumberOfRetry times
|
||||
*/
|
||||
public String getInputSource(final String requestUrl) throws DnetCollectorException {
|
||||
return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the URL returns the content as a stream via HTTP GET
|
||||
*
|
||||
* @param requestUrl
|
||||
* the URL
|
||||
* @return the content of the downloaded resource as InputStream
|
||||
* @throws CollectorServiceException
|
||||
* when retrying more than maxNumberOfRetry times
|
||||
*/
|
||||
public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException {
|
||||
return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||
}
|
||||
|
||||
private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
||||
throws DnetCollectorException {
|
||||
try {
|
||||
final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
|
||||
try {
|
||||
return IOUtils.toString(s);
|
||||
} catch (final IOException e) {
|
||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
||||
Thread.sleep(defaultDelay * 1000);
|
||||
errorList.add(e.getMessage());
|
||||
return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList);
|
||||
} finally {
|
||||
IOUtils.closeQuietly(s);
|
||||
}
|
||||
} catch (final InterruptedException e) {
|
||||
throw new DnetCollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private InputStream attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
|
||||
throws DnetCollectorException {
|
||||
|
||||
if (retryNumber > maxNumberOfRetry) { throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList); }
|
||||
|
||||
log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
|
||||
try {
|
||||
InputStream input = null;
|
||||
|
||||
try {
|
||||
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
|
||||
urlConn.setInstanceFollowRedirects(false);
|
||||
urlConn.setReadTimeout(readTimeOut * 1000);
|
||||
urlConn.addRequestProperty("User-Agent", userAgent);
|
||||
|
||||
if (log.isDebugEnabled()) {
|
||||
logHeaderFields(urlConn);
|
||||
}
|
||||
|
||||
final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
|
||||
if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) {
|
||||
log.warn("waiting and repeating request after " + retryAfter + " sec.");
|
||||
Thread.sleep(retryAfter * 1000);
|
||||
errorList.add("503 Service Unavailable");
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||
} else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) {
|
||||
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
|
||||
log.debug("The requested url has been moved to " + newUrl);
|
||||
errorList.add(String.format("%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl));
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(newUrl, retryNumber + 1, errorList);
|
||||
} else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) {
|
||||
log.error(String.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
||||
Thread.sleep(defaultDelay * 1000);
|
||||
errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()));
|
||||
urlConn.disconnect();
|
||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||
} else {
|
||||
input = urlConn.getInputStream();
|
||||
responseType = urlConn.getContentType();
|
||||
return input;
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
log.error("error while retrieving from http-connection occured: " + requestUrl, e);
|
||||
Thread.sleep(defaultDelay * 1000);
|
||||
errorList.add(e.getMessage());
|
||||
return attemptDownload(requestUrl, retryNumber + 1, errorList);
|
||||
}
|
||||
} catch (final InterruptedException e) {
|
||||
throw new DnetCollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
|
||||
log.debug("StatusCode: " + urlConn.getResponseMessage());
|
||||
|
||||
for (final Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
|
||||
if (e.getKey() != null) {
|
||||
for (final String v : e.getValue()) {
|
||||
log.debug(" key: " + e.getKey() + " - value: " + v);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
|
||||
for (final String key : headerMap.keySet()) {
|
||||
if (key != null && key.toLowerCase().equals("retry-after") && headerMap.get(key).size() > 0
|
||||
&& NumberUtils.isNumber(headerMap.get(key).get(0))) { return Integer.parseInt(headerMap.get(key).get(0)) + 10; }
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws DnetCollectorException {
|
||||
for (final String key : headerMap.keySet()) {
|
||||
if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) { return headerMap.get(key).get(0); }
|
||||
}
|
||||
throw new DnetCollectorException("The requested url has been MOVED, but 'location' param is MISSING");
|
||||
}
|
||||
|
||||
/**
|
||||
* register for https scheme; this is a workaround and not intended for the use in trusted environments
|
||||
*/
|
||||
public void initTrustManager() {
|
||||
final X509TrustManager tm = new X509TrustManager() {
|
||||
|
||||
@Override
|
||||
public void checkClientTrusted(final X509Certificate[] xcs, final String string) {}
|
||||
|
||||
@Override
|
||||
public void checkServerTrusted(final X509Certificate[] xcs, final String string) {}
|
||||
|
||||
@Override
|
||||
public X509Certificate[] getAcceptedIssuers() {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
try {
|
||||
final SSLContext ctx = SSLContext.getInstance("TLS");
|
||||
ctx.init(null, new TrustManager[] { tm }, null);
|
||||
HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
|
||||
} catch (final GeneralSecurityException e) {
|
||||
log.fatal(e);
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public int getMaxNumberOfRetry() {
|
||||
return maxNumberOfRetry;
|
||||
}
|
||||
|
||||
public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
|
||||
this.maxNumberOfRetry = maxNumberOfRetry;
|
||||
}
|
||||
|
||||
public int getDefaultDelay() {
|
||||
return defaultDelay;
|
||||
}
|
||||
|
||||
public void setDefaultDelay(final int defaultDelay) {
|
||||
this.defaultDelay = defaultDelay;
|
||||
}
|
||||
|
||||
public int getReadTimeOut() {
|
||||
return readTimeOut;
|
||||
}
|
||||
|
||||
public void setReadTimeOut(final int readTimeOut) {
|
||||
this.readTimeOut = readTimeOut;
|
||||
}
|
||||
|
||||
public String getResponseType() {
|
||||
return responseType;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,259 @@
|
|||
package eu.dnetlib.collector.worker.utils;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* @author jochen, Andreas Czerniak
|
||||
*
|
||||
*/
|
||||
public class XmlCleaner {
|
||||
|
||||
/**
|
||||
* Pattern for numeric entities.
|
||||
*/
|
||||
private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); //$NON-NLS-1$
|
||||
// private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); //$NON-NLS-1$
|
||||
|
||||
// see https://www.w3.org/TR/REC-xml/#charsets , not only limited to 
|
||||
private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];");
|
||||
|
||||
/**
|
||||
* Pattern that negates the allowable XML 4 byte unicode characters. Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
|
||||
* [#x10000-#x10FFFF]
|
||||
*/
|
||||
private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); //$NON-NLS-1$
|
||||
|
||||
// Map entities to their unicode equivalent
|
||||
private static Set<String> goodEntities = new HashSet<>();
|
||||
private static Map<String, String> badEntities = new HashMap<>();
|
||||
|
||||
static {
|
||||
// pre-defined XML entities
|
||||
goodEntities.add("""); //$NON-NLS-1$ // quotation mark
|
||||
goodEntities.add("&"); //$NON-NLS-1$ // ampersand
|
||||
goodEntities.add("<"); //$NON-NLS-1$ // less-than sign
|
||||
goodEntities.add(">"); //$NON-NLS-1$ // greater-than sign
|
||||
// control entities
|
||||
// badEntities.put("", "");
|
||||
badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("€", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("‚", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("ƒ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("„", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("…", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("†", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("‡", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("ˆ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("‰", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("Š", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("‹", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("Œ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("Ž", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("‘", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("’", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("“", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("”", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("•", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("–", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("—", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("˜", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("™", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("š", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("›", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("œ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("ž", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("Ÿ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
// misc entities
|
||||
badEntities.put("€", "\u20AC"); //$NON-NLS-1$ //$NON-NLS-2$ // euro
|
||||
badEntities.put("‘", "\u2018"); //$NON-NLS-1$ //$NON-NLS-2$ // left single quotation mark
|
||||
badEntities.put("’", "\u2019"); //$NON-NLS-1$ //$NON-NLS-2$ // right single quotation mark
|
||||
// Latin 1 entities
|
||||
badEntities.put(" ", "\u00A0"); //$NON-NLS-1$ //$NON-NLS-2$ // no-break space
|
||||
badEntities.put("¡", "\u00A1"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation mark
|
||||
badEntities.put("¢", "\u00A2"); //$NON-NLS-1$ //$NON-NLS-2$ // cent sign
|
||||
badEntities.put("£", "\u00A3"); //$NON-NLS-1$ //$NON-NLS-2$ // pound sign
|
||||
badEntities.put("¤", "\u00A4"); //$NON-NLS-1$ //$NON-NLS-2$ // currency sign
|
||||
badEntities.put("¥", "\u00A5"); //$NON-NLS-1$ //$NON-NLS-2$ // yen sign
|
||||
badEntities.put("¦", "\u00A6"); //$NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar
|
||||
badEntities.put("§", "\u00A7"); //$NON-NLS-1$ //$NON-NLS-2$ // section sign
|
||||
badEntities.put("¨", "\u00A8"); //$NON-NLS-1$ //$NON-NLS-2$ // diaeresis
|
||||
badEntities.put("©", "\u00A9"); //$NON-NLS-1$ //$NON-NLS-2$ // copyright sign
|
||||
badEntities.put("ª", "\u00AA"); //$NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal indicator
|
||||
badEntities.put("«", "\u00AB"); //$NON-NLS-1$ //$NON-NLS-2$ // left-pointing double angle quotation mark
|
||||
badEntities.put("¬", "\u00AC"); //$NON-NLS-1$ //$NON-NLS-2$ // not sign
|
||||
badEntities.put("­", "\u00AD"); //$NON-NLS-1$ //$NON-NLS-2$ // soft hyphen
|
||||
badEntities.put("®", "\u00AE"); //$NON-NLS-1$ //$NON-NLS-2$ // registered sign
|
||||
badEntities.put("¯", "\u00AF"); //$NON-NLS-1$ //$NON-NLS-2$ // macron
|
||||
badEntities.put("°", "\u00B0"); //$NON-NLS-1$ //$NON-NLS-2$ // degree sign
|
||||
badEntities.put("±", "\u00B1"); //$NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign
|
||||
badEntities.put("²", "\u00B2"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript two
|
||||
badEntities.put("³", "\u00B3"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript three
|
||||
badEntities.put("´", "\u00B4"); //$NON-NLS-1$ //$NON-NLS-2$ // acute accent
|
||||
badEntities.put("µ", "\u00B5"); //$NON-NLS-1$ //$NON-NLS-2$ // micro sign
|
||||
badEntities.put("¶", "\u00B6"); //$NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign
|
||||
badEntities.put("·", "\u00B7"); //$NON-NLS-1$ //$NON-NLS-2$ // middle dot
|
||||
badEntities.put("¸", "\u00B8"); //$NON-NLS-1$ //$NON-NLS-2$ // cedilla
|
||||
badEntities.put("¹", "\u00B9"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript one
|
||||
badEntities.put("º", "\u00BA"); //$NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal indicator
|
||||
badEntities.put("»", "\u00BB"); //$NON-NLS-1$ //$NON-NLS-2$ // right-pointing double angle quotation mark
|
||||
badEntities.put("¼", "\u00BC"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one quarter
|
||||
badEntities.put("½", "\u00BD"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one half
|
||||
badEntities.put("¾", "\u00BE"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three quarters
|
||||
badEntities.put("¿", "\u00BF"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted question mark
|
||||
badEntities.put("À", "\u00C0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with grave
|
||||
badEntities.put("Á", "\u00C1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with acute
|
||||
badEntities.put("Â", "\u00C2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with circumflex
|
||||
badEntities.put("Ã", "\u00C3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with tilde
|
||||
badEntities.put("Ä", "\u00C4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with diaeresis
|
||||
badEntities.put("Å", "\u00C5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with ring above
|
||||
badEntities.put("Æ", "\u00C6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter AE
|
||||
badEntities.put("Ç", "\u00C7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter C with cedilla
|
||||
badEntities.put("È", "\u00C8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with grave
|
||||
badEntities.put("É", "\u00C9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with acute
|
||||
badEntities.put("Ê", "\u00CA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with circumflex
|
||||
badEntities.put("Ë", "\u00CB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with diaeresis
|
||||
badEntities.put("Ì", "\u00CC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with grave
|
||||
badEntities.put("Í", "\u00CD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with acute
|
||||
badEntities.put("Î", "\u00CE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with circumflex
|
||||
badEntities.put("Ï", "\u00CF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with diaeresis
|
||||
badEntities.put("Ð", "\u00D0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH
|
||||
badEntities.put("Ñ", "\u00D1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter N with tilde
|
||||
badEntities.put("Ò", "\u00D2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with grave
|
||||
badEntities.put("Ó", "\u00D3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with acute
|
||||
badEntities.put("Ô", "\u00D4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with circumflex
|
||||
badEntities.put("Õ", "\u00D5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with tilde
|
||||
badEntities.put("Ö", "\u00D6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with diaeresis
|
||||
badEntities.put("×", "\u00D7"); //$NON-NLS-1$ //$NON-NLS-2$ // multiplication sign
|
||||
badEntities.put("Ø", "\u00D8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with stroke
|
||||
badEntities.put("Ù", "\u00D9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with grave
|
||||
badEntities.put("Ú", "\u00DA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with acute
|
||||
badEntities.put("Û", "\u00DB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with circumflex
|
||||
badEntities.put("Ü", "\u00DC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with diaeresis
|
||||
badEntities.put("Ý", "\u00DD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter Y with acute
|
||||
badEntities.put("Þ", "\u00DE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter THORN
|
||||
badEntities.put("ß", "\u00DF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter sharp s
|
||||
badEntities.put("à", "\u00E0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with grave
|
||||
badEntities.put("á", "\u00E1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with acute
|
||||
badEntities.put("â", "\u00E2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with circumflex
|
||||
badEntities.put("ã", "\u00E3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with tilde
|
||||
badEntities.put("ä", "\u00E4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with diaeresis
|
||||
badEntities.put("å", "\u00E5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with ring above
|
||||
badEntities.put("æ", "\u00E6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae
|
||||
badEntities.put("ç", "\u00E7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter c with cedilla
|
||||
badEntities.put("è", "\u00E8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with grave
|
||||
badEntities.put("é", "\u00E9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with acute
|
||||
badEntities.put("ê", "\u00EA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with circumflex
|
||||
badEntities.put("ë", "\u00EB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with diaeresis
|
||||
badEntities.put("ì", "\u00EC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with grave
|
||||
badEntities.put("í", "\u00ED"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with acute
|
||||
badEntities.put("î", "\u00EE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with circumflex
|
||||
badEntities.put("ï", "\u00EF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with diaeresis
|
||||
badEntities.put("ð", "\u00F0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth
|
||||
badEntities.put("ñ", "\u00F1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter n with tilde
|
||||
badEntities.put("ò", "\u00F2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with grave
|
||||
badEntities.put("ó", "\u00F3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with acute
|
||||
badEntities.put("ô", "\u00F4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with circumflex
|
||||
badEntities.put("õ", "\u00F5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with tilde
|
||||
badEntities.put("ö", "\u00F6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with diaeresis
|
||||
badEntities.put("÷", "\u00F7"); //$NON-NLS-1$ //$NON-NLS-2$ // division sign
|
||||
badEntities.put("ø", "\u00F8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with stroke
|
||||
badEntities.put("ù", "\u00F9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with grave
|
||||
badEntities.put("ú", "\u00FA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with acute
|
||||
badEntities.put("û", "\u00FB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with circumflex
|
||||
badEntities.put("ü", "\u00FC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with diaeresis
|
||||
badEntities.put("ý", "\u00FD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with acute
|
||||
badEntities.put("þ", "\u00FE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter thorn
|
||||
badEntities.put("ÿ", "\u00FF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with diaeresis
|
||||
}
|
||||
|
||||
/**
|
||||
* For each entity in the input that is not allowed in XML, replace the entity with its unicode equivalent or remove it. For each
|
||||
* instance of a bare {@literal &}, replace it with {@literal &<br/>
|
||||
* } XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal &lt;} and {@literal &gt;}.
|
||||
*
|
||||
* @param broken
|
||||
* the string to handle entities
|
||||
* @return the string with entities appropriately fixed up
|
||||
*/
|
||||
static public String cleanAllEntities(final String broken) {
|
||||
if (broken == null) { return null; }
|
||||
|
||||
String working = invalidControlCharPattern.matcher(broken).replaceAll("");
|
||||
working = invalidCharacterPattern.matcher(working).replaceAll("");
|
||||
|
||||
int cleanfrom = 0;
|
||||
|
||||
while (true) {
|
||||
int amp = working.indexOf('&', cleanfrom);
|
||||
// If there are no more amps then we are done
|
||||
if (amp == -1) {
|
||||
break;
|
||||
}
|
||||
// Skip references of the kind &#ddd;
|
||||
if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
|
||||
cleanfrom = working.indexOf(';', amp) + 1;
|
||||
continue;
|
||||
}
|
||||
int i = amp + 1;
|
||||
while (true) {
|
||||
// if we are at the end of the string then just escape the '&';
|
||||
if (i >= working.length()) { return working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$
|
||||
}
|
||||
// if we have come to a ; then we have an entity
|
||||
// If it is something that xml can't handle then replace it.
|
||||
final char c = working.charAt(i);
|
||||
if (c == ';') {
|
||||
final String entity = working.substring(amp, i + 1);
|
||||
final String replace = handleEntity(entity);
|
||||
working = working.substring(0, amp) + replace + working.substring(i + 1);
|
||||
break;
|
||||
}
|
||||
// Did we end an entity without finding a closing ;
|
||||
// Then treat it as an '&' that needs to be replaced with &
|
||||
if (!Character.isLetterOrDigit(c)) {
|
||||
working = working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$
|
||||
amp = i + 4; // account for the 4 extra characters
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
cleanfrom = amp + 1;
|
||||
}
|
||||
|
||||
if (Pattern.compile("<<").matcher(working).find()) {
|
||||
working = working.replaceAll("<<", "<<");
|
||||
}
|
||||
|
||||
if (Pattern.compile(">>").matcher(working).find()) {
|
||||
working = working.replaceAll(">>", ">>");
|
||||
}
|
||||
|
||||
return working;
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip it out. XML only allows 4 entities:
|
||||
* &amp;, &quot;, &lt; and &gt;.
|
||||
*
|
||||
* @param entity
|
||||
* the entity to be replaced
|
||||
* @return the substitution for the entity, either itself, the unicode equivalent or an empty string.
|
||||
*/
|
||||
private static String handleEntity(final String entity) {
|
||||
if (goodEntities.contains(entity)) { return entity; }
|
||||
|
||||
final String replace = badEntities.get(entity);
|
||||
if (replace != null) { return replace; }
|
||||
|
||||
return replace != null ? replace : "";
|
||||
}
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
|
|
@ -0,0 +1,45 @@
|
|||
package eu.dnetlib.collector.worker;
|
||||
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.CommandLineRunner;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.context.ApplicationContext;
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.collector.worker.model.ApiDescriptor;
|
||||
import eu.dnetlib.collector.worker.utils.CollectorPluginEnumerator;
|
||||
|
||||
@RunWith(SpringRunner.class)
|
||||
@SpringBootTest
|
||||
public class DnetCollectorWorkerApplicationTests {
|
||||
|
||||
@Autowired
|
||||
private ApplicationContext ctx;
|
||||
|
||||
@Test
|
||||
public void testFindPlugin() throws Exception {
|
||||
final CollectorPluginEnumerator collectorPluginEnumerator = ctx.getBean(CollectorPluginEnumerator.class);
|
||||
assertNotNull(collectorPluginEnumerator.getPluginByProtocol("oai"));
|
||||
assertNotNull(collectorPluginEnumerator.getPluginByProtocol("OAI"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCollectionOAI() throws Exception {
|
||||
final ApiDescriptor api = new ApiDescriptor();
|
||||
api.setId("oai");
|
||||
api.setProtocol("oai");
|
||||
api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai");
|
||||
api.getParams().put("format", "oai_dc");
|
||||
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
System.out.println(mapper.writeValueAsString(api));
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue