From c10770cd3e27296eb8ae094249a9ac0cb8f8a4ce Mon Sep 17 00:00:00 2001 From: luosolo Date: Tue, 12 Mar 2019 15:40:55 +0100 Subject: [PATCH] added module that allows to collect data into HDFS --- dhp-collector-worker/pom.xml | 64 +++++ .../worker/DnetCollectorException.java | 30 ++ .../DnetCollectorWorkerApplication.java | 105 +++++++ .../collector/worker/model/ApiDescriptor.java | 48 ++++ .../worker/plugins/CollectorPlugin.java | 11 + .../plugins/oai/OaiCollectorPlugin.java | 66 +++++ .../worker/plugins/oai/OaiIterator.java | 163 +++++++++++ .../plugins/oai/OaiIteratorFactory.java | 20 ++ .../utils/CollectorPluginEnumerator.java | 24 ++ .../utils/CollectorPluginErrorLogList.java | 19 ++ .../worker/utils/DnetWorkerCollector.java | 14 + .../collector/worker/utils/HttpConnector.java | 226 +++++++++++++++ .../collector/worker/utils/XmlCleaner.java | 259 ++++++++++++++++++ .../src/main/resources/application.properties | 1 + .../DnetCollectorWorkerApplicationTests.java | 45 +++ 15 files changed, 1095 insertions(+) create mode 100644 dhp-collector-worker/pom.xml create mode 100644 dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java create mode 100644 dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java create mode 100644 dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java create mode 100644 dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java create mode 100644 dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java create mode 100644 dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java create mode 100644 dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIteratorFactory.java create mode 100644 dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java create mode 
100644 dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java create mode 100644 dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java create mode 100644 dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java create mode 100644 dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java create mode 100644 dhp-collector-worker/src/main/resources/application.properties create mode 100644 dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java diff --git a/dhp-collector-worker/pom.xml b/dhp-collector-worker/pom.xml new file mode 100644 index 000000000..aa0e1c917 --- /dev/null +++ b/dhp-collector-worker/pom.xml @@ -0,0 +1,64 @@ + + + + dhp + eu.dnetlib.dhp + 1.0.0-SNAPSHOT + + 4.0.0 + + dhp-collector-worker + + + + org.springframework.boot + spring-boot-starter + + + org.apache.hadoop + hadoop-client + ${dhp.hadoop.version} + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-databind + + + dom4j + dom4j + + + jaxen + jaxen + + + org.springframework.boot + spring-boot-starter-test + test + + + + + + + org.springframework.boot + spring-boot-maven-plugin + + true + + + + + + + \ No newline at end of file diff --git a/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java new file mode 100644 index 000000000..bc4287a0d --- /dev/null +++ b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java @@ -0,0 +1,30 @@ +package eu.dnetlib.collector.worker; + +public class DnetCollectorException extends Exception { + + /** + * + */ + private static final long serialVersionUID = -290723075076039757L; + + public DnetCollectorException() { + 
super();
+    }
+
+    public DnetCollectorException(final String message, final Throwable cause, final boolean enableSuppression, final boolean writableStackTrace) {
+        super(message, cause, enableSuppression, writableStackTrace);
+    }
+
+    public DnetCollectorException(final String message, final Throwable cause) {
+        super(message, cause);
+    }
+
+    public DnetCollectorException(final String message) {
+        super(message);
+    }
+
+    public DnetCollectorException(final Throwable cause) {
+        super(cause);
+    }
+
+}
diff --git a/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java
new file mode 100644
index 000000000..b10f11c5f
--- /dev/null
+++ b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java
@@ -0,0 +1,131 @@
+package eu.dnetlib.collector.worker;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.CommandLineRunner;
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.collector.worker.model.ApiDescriptor;
+import eu.dnetlib.collector.worker.plugins.CollectorPlugin;
+import eu.dnetlib.collector.worker.utils.CollectorPluginEnumerator;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.net.URI;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Stream;
+
+/**
+ * Command line application that collects the records exposed by a remote API and stores them
+ * in a sequence file.
+ */
+@SpringBootApplication
+public class DnetCollectorWorkerApplication implements CommandLineRunner {
+
+    private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class);
+
+    /** System property that overrides the default file system URI. */
+    private static final String HDFS_URI_PROPERTY = "dhp.hdfs.uri";
+
+    /** File system URI used when the override property is not set. */
+    private static final String DEFAULT_HDFS_URI = "hdfs://hadoop-rm1.garr-pa1.d4science.org:8020";
+
+    /** System property holding the Hadoop user name. */
+    private static final String HADOOP_USER_PROPERTY = "HADOOP_USER_NAME";
+
+    /** User name applied only when the launcher has not set one. */
+    private static final String DEFAULT_HADOOP_USER = "sandro.labruzzo";
+
+    @Autowired
+    private CollectorPluginEnumerator collectorPluginEnumerator;
+
+    /**
+     * Starts the Spring application with the given command line.
+     *
+     * @param args
+     *            hdfs_path and json_api_description
+     */
+    public static void main(final String[] args) {
+        SpringApplication.run(DnetCollectorWorkerApplication.class, args);
+    }
+
+    /**
+     * Collects the records of the API described by the JSON argument and appends them to a
+     * sequence file, each one keyed by a zero-based progressive number.
+     *
+     * @param args
+     *            either empty (nothing is done) or exactly two values: the path of the file to
+     *            write and the JSON serialization of an {@link ApiDescriptor}
+     * @throws DnetCollectorException
+     *             if the number of arguments is neither 0 nor 2, or if raised by the plugin
+     * @throws UncheckedIOException
+     *             if a record cannot be appended to the sequence file
+     */
+    @Override
+    public void run(final String... args) throws Exception {
+        if (args.length == 0) { return; }
+        if (args.length != 2) { throw new DnetCollectorException("Invalid number of parameters, expected: hdfs_path and json_api_description"); }
+
+        final String hdfsPath = args[0];
+        log.info("hdfsPath ={}", hdfsPath);
+
+        final String json = args[1];
+        log.info("json = {}", json);
+
+        final ApiDescriptor api = new ObjectMapper().readValue(json, ApiDescriptor.class);
+        final CollectorPlugin plugin = collectorPluginEnumerator.getPluginByProtocol(api.getProtocol());
+
+        // Overridable from the launcher with -Ddhp.hdfs.uri=<uri>
+        final String hdfsUri = System.getProperty(HDFS_URI_PROPERTY, DEFAULT_HDFS_URI);
+
+        // ====== Init the Hadoop configuration
+        final Configuration conf = new Configuration();
+        // Set FileSystem URI
+        conf.set("fs.defaultFS", hdfsUri);
+        // Because of Maven
+        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+        // Set HADOOP user, unless the launcher already provided one
+        if (System.getProperty(HADOOP_USER_PROPERTY) == null) {
+            System.setProperty(HADOOP_USER_PROPERTY, DEFAULT_HADOOP_USER);
+        }
+        System.setProperty("hadoop.home.dir", "/");
+
+        final Path hdfswritepath = new Path(hdfsPath);
+        log.info("Created path {}", hdfswritepath);
+
+        final AtomicInteger counter = new AtomicInteger(0);
+        final IntWritable key = new IntWritable();
+        final Text value = new Text();
+
+        // Both the writer and the record stream are closed on exit, even if the collection fails
+        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
+                SequenceFile.Writer.file(hdfswritepath),
+                SequenceFile.Writer.keyClass(IntWritable.class),
+                SequenceFile.Writer.valueClass(Text.class));
+                Stream<String> records = plugin.collect(api)) {
+
+            records.forEach(content -> {
+                key.set(counter.getAndIncrement());
+                value.set(content);
+                try {
+                    writer.append(key, value);
+                } catch (final IOException e) {
+                    throw new UncheckedIOException(e);
+                }
+            });
+        }
+    }
+
+}
diff --git a/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java
new file mode 100644
index 000000000..28b7f75d4
--- /dev/null
+++ b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java
@@ -0,0 +1,55 @@
+package eu.dnetlib.collector.worker.model;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Description of a remote API: identifier, base url, protocol and protocol specific parameters.
+ */
+public class ApiDescriptor {
+
+    private String id;
+
+    private String baseUrl;
+
+    private String protocol;
+
+    private Map<String, String> params = new HashMap<>();
+
+    public String getBaseUrl() {
+        return baseUrl;
+    }
+
+    public void setBaseUrl(final String baseUrl) {
+        this.baseUrl = baseUrl;
+    }
+
+    public String getId() {
+        return id;
+    }
+
+    public void setId(final String id) {
+        this.id = id;
+    }
+
+    public Map<String, String> getParams() {
+        return params;
+    }
+
+    /**
+     * @param params
+     *            the new parameters, {@code null} is stored as an empty map
+     */
+    public void setParams(final HashMap<String, String> params) {
+        this.params = params != null ? params : new HashMap<>();
+    }
+
+    public String getProtocol() {
+        return protocol;
+    }
+
+    public void setProtocol(final String protocol) {
+        this.protocol = protocol;
+    }
+
+}
diff --git a/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java
new file mode 100644
index 000000000..5ec1e9a6e
--- /dev/null
+++ b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java
@@ -0,0 +1,11 @@
+package eu.dnetlib.collector.worker.plugins;
+
+import java.util.stream.Stream;
+
+import eu.dnetlib.collector.worker.DnetCollectorException;
+import 
eu.dnetlib.collector.worker.model.ApiDescriptor;
+
+public interface CollectorPlugin {
+
+    Stream<String> collect(ApiDescriptor api) throws DnetCollectorException;
+}
diff --git a/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java
new file mode 100644
index 000000000..a536eef5c
--- /dev/null
+++ b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java
@@ -0,0 +1,83 @@
+package eu.dnetlib.collector.worker.plugins.oai;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Spliterator;
+import java.util.Spliterators;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.collector.worker.DnetCollectorException;
+import eu.dnetlib.collector.worker.model.ApiDescriptor;
+import eu.dnetlib.collector.worker.plugins.CollectorPlugin;
+import eu.dnetlib.collector.worker.utils.DnetWorkerCollector;
+
+/**
+ * Collector plugin registered for the "oai" protocol.
+ */
+@Component
+@DnetWorkerCollector("oai")
+public class OaiCollectorPlugin implements CollectorPlugin {
+
+    private static final String FORMAT_PARAM = "format";
+    private static final String OAI_SET_PARAM = "set";
+    private static final String OAI_FROM_DATE_PARAM = "fromDate";
+    private static final String OAI_UNTIL_DATE_PARAM = "untilDate";
+
+    /** The only date form accepted for fromDate and untilDate: YYYY-MM-DD. */
+    private static final Pattern DATE_PATTERN = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");
+
+    @Autowired
+    private OaiIteratorFactory oaiIteratorFactory;
+
+    /**
+     * Returns the records of the requested sets, one set after the other.
+     *
+     * @param api
+     *            supplies the base url and the params "format" (mandatory), "set" (optional, comma
+     *            separated list) and "fromDate" / "untilDate" (optional, YYYY-MM-DD)
+     * @return an ordered, sequential stream backed by one iterator per set
+     * @throws DnetCollectorException
+     *             if the base url or the format is missing, or if a date is malformed
+     */
+    @Override
+    public Stream<String> collect(final ApiDescriptor api) throws DnetCollectorException {
+        final String baseUrl = api.getBaseUrl();
+        final String mdFormat = api.getParams().get(FORMAT_PARAM);
+        final String setParam = api.getParams().get(OAI_SET_PARAM);
+        final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM);
+        final String untilDate = api.getParams().get(OAI_UNTIL_DATE_PARAM);
+
+        final List<String> sets = new ArrayList<>();
+        if (setParam != null) {
+            sets.addAll(Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam)));
+        }
+        if (sets.isEmpty()) {
+            // If no set is defined, ALL the sets must be harvested
+            sets.add("");
+        }
+
+        if (baseUrl == null || baseUrl.isEmpty()) { throw new DnetCollectorException("Param 'baseurl' is null or empty"); }
+
+        if (mdFormat == null || mdFormat.isEmpty()) { throw new DnetCollectorException("Param 'mdFormat' is null or empty"); }
+
+        if (fromDate != null && !DATE_PATTERN.matcher(fromDate).matches()) { throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); }
+
+        if (untilDate != null && !DATE_PATTERN.matcher(untilDate).matches()) { throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); }
+
+        final Iterator<Iterator<String>> iters = sets.stream()
+                .map(set -> oaiIteratorFactory.newIterator(baseUrl, mdFormat, set, fromDate, untilDate))
+                .iterator();
+
+        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), false);
+    }
+}
diff --git a/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java
new file mode 100644
index 000000000..191b7b596
--- /dev/null
+++ b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java
@@ -0,0 +1,163 @@
+package eu.dnetlib.collector.worker.plugins.oai;
+
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
+import java.util.Iterator;
+import java.util.Queue;
+import java.util.concurrent.PriorityBlockingQueue;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Node; +import org.dom4j.io.SAXReader; + +import eu.dnetlib.collector.worker.DnetCollectorException; +import eu.dnetlib.collector.worker.utils.HttpConnector; +import eu.dnetlib.collector.worker.utils.XmlCleaner; + +public class OaiIterator implements Iterator { + + private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM + + private final Queue queue = new PriorityBlockingQueue<>(); + private final SAXReader reader = new SAXReader(); + + private final String baseUrl; + private final String set; + private final String mdFormat; + private final String fromDate; + private final String untilDate; + private String token; + private boolean started; + private final HttpConnector httpConnector; + + public OaiIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate, + final HttpConnector httpConnector) { + this.baseUrl = baseUrl; + this.mdFormat = mdFormat; + this.set = set; + this.fromDate = fromDate; + this.untilDate = untilDate; + this.started = false; + this.httpConnector = httpConnector; + } + + private void verifyStarted() { + if (!this.started) { + this.started = true; + try { + this.token = firstPage(); + } catch (final DnetCollectorException e) { + throw new RuntimeException(e); + } + } + } + + @Override + public boolean hasNext() { + synchronized (queue) { + verifyStarted(); + return !queue.isEmpty(); + } + } + + @Override + public String next() { + synchronized (queue) { + verifyStarted(); + final String res = queue.poll(); + while (queue.isEmpty() && token != null && !token.isEmpty()) { + try { + token = otherPages(token); + } catch (final DnetCollectorException e) { + throw new RuntimeException(e); + } + } + return res; + } + } + + @Override + public void remove() {} + + private String firstPage() throws DnetCollectorException { + try { + String url = baseUrl + 
"?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8"); + if (set != null && !set.isEmpty()) { + url += "&set=" + URLEncoder.encode(set, "UTF-8"); + } + if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + url += "&from=" + URLEncoder.encode(fromDate, "UTF-8"); + } + if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + url += "&until=" + URLEncoder.encode(untilDate, "UTF-8"); + } + log.info("Start harvesting using url: " + url); + + return downloadPage(url); + } catch (final UnsupportedEncodingException e) { + throw new DnetCollectorException(e); + } + } + + private String extractResumptionToken(final String xml) { + + final String s = StringUtils.substringAfter(xml, "", " newIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate) { + return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, httpConnector); + } + +} diff --git a/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java new file mode 100644 index 000000000..3f5b245ef --- /dev/null +++ b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java @@ -0,0 +1,24 @@ +package eu.dnetlib.collector.worker.utils; + +import java.util.List; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + +import eu.dnetlib.collector.worker.plugins.CollectorPlugin; + +@Component +public class CollectorPluginEnumerator { + + @Autowired + private List plugins; + + public CollectorPlugin getPluginByProtocol(final String protocol) { + return plugins.stream() + .filter(p -> p.getClass().isAnnotationPresent(DnetWorkerCollector.class)) + .filter(p -> p.getClass().getAnnotation(DnetWorkerCollector.class).value().equalsIgnoreCase(protocol)) + .findFirst() + 
.orElseThrow(() -> new java.util.NoSuchElementException("No collector plugin found for protocol: " + protocol));
+    }
+
+}
diff --git a/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java
new file mode 100644
index 000000000..062f6c7a8
--- /dev/null
+++ b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java
@@ -0,0 +1,25 @@
+package eu.dnetlib.collector.worker.utils;
+
+import java.util.LinkedList;
+
+/**
+ * Ordered list of the error messages gathered while retrying a download.
+ */
+public class CollectorPluginErrorLogList extends LinkedList<String> {
+
+    private static final long serialVersionUID = -6925786561303289704L;
+
+    /**
+     * @return all the messages on a single line, each one prefixed by its zero-based position
+     */
+    @Override
+    public String toString() {
+        final StringBuilder log = new StringBuilder();
+        int index = 0;
+        for (final String errorMessage : this) {
+            log.append(String.format("Retry #%s: %s / ", index++, errorMessage));
+        }
+        return log.toString();
+    }
+
+}
diff --git a/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java
new file mode 100644
index 000000000..28891c84f
--- /dev/null
+++ b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java
@@ -0,0 +1,14 @@
+package eu.dnetlib.collector.worker.utils;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+@Retention(RetentionPolicy.RUNTIME)
+@Target(ElementType.TYPE)
+public @interface DnetWorkerCollector {
+
+    String value();
+
+}
diff --git a/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java
new file mode 100644
index 000000000..8a24381e5
--- /dev/null
+++ b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java
@@ -0,0 +1,280 @@
+package eu.dnetlib.collector.worker.utils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.CookieHandler;
+import java.net.CookieManager;
+import java.net.CookiePolicy;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.security.GeneralSecurityException;
+import java.security.cert.X509Certificate;
+import java.util.List;
+import java.util.Map;
+
+import javax.net.ssl.HttpsURLConnection;
+import javax.net.ssl.SSLContext;
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.X509TrustManager;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.math.NumberUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.springframework.stereotype.Component;
+
+import eu.dnetlib.collector.worker.DnetCollectorException;
+
+/**
+ * HTTP GET client that retries failed requests, follows 301/302 redirects itself and waits as
+ * requested by the Retry-After header of a 503 response.
+ */
+@Component
+public class HttpConnector {
+
+    private static final Log log = LogFactory.getLog(HttpConnector.class);
+
+    private int maxNumberOfRetry = 6;
+    private int defaultDelay = 120; // seconds
+    private int readTimeOut = 120; // seconds
+
+    private String responseType = null;
+
+    private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
+
+    public HttpConnector() {
+        CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
+    }
+
+    /**
+     * Given the URL returns the content via HTTP GET
+     *
+     * @param requestUrl
+     *            the URL
+     * @return the content of the downloaded resource
+     * @throws DnetCollectorException
+     *             when retrying more than maxNumberOfRetry times
+     */
+    public String getInputSource(final String requestUrl) throws DnetCollectorException {
+        return attemptDownloadAsString(requestUrl, 1, new CollectorPluginErrorLogList());
+    }
+
+    /**
+     * Given the URL returns the content as a stream via HTTP GET
+     *
+     * @param requestUrl
+     *            the URL
+     * @return the content of the downloaded resource as InputStream
+     * @throws DnetCollectorException
+     *             when retrying more than maxNumberOfRetry times
+     */
+    public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException {
+        return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList());
+    }
+
+    /**
+     * Downloads the resource and decodes it as UTF-8, starting over when the body cannot be read.
+     * Retry counter and error list are handed over to {@link #attemptDownload}, so that the total
+     * number of attempts stays bounded by maxNumberOfRetry.
+     */
+    private String attemptDownloadAsString(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
+            throws DnetCollectorException {
+        final InputStream s = attemptDownload(requestUrl, retryNumber, errorList);
+        try {
+            // explicit charset, instead of the platform default
+            return IOUtils.toString(s, "UTF-8");
+        } catch (final IOException e) {
+            log.error("error while retrieving from http-connection occured: " + requestUrl, e);
+            errorList.add(e.getMessage());
+            sleepSeconds(defaultDelay);
+            // each failed attempt has added exactly one entry to the error list
+            return attemptDownloadAsString(requestUrl, errorList.size() + 1, errorList);
+        } finally {
+            IOUtils.closeQuietly(s);
+        }
+    }
+
+    /**
+     * Performs the GET request; on failure waits and tries again, up to maxNumberOfRetry attempts.
+     *
+     * @return the body of the first 200 response; the caller is in charge of closing it
+     * @throws DnetCollectorException
+     *             if the attempts are exhausted, a redirect carries no location, or the thread is
+     *             interrupted while waiting
+     */
+    private InputStream attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList)
+            throws DnetCollectorException {
+
+        if (retryNumber > maxNumberOfRetry) { throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList); }
+
+        log.debug("Downloading " + requestUrl + " - try: " + retryNumber);
+
+        HttpURLConnection urlConn = null;
+        try {
+            urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
+            urlConn.setInstanceFollowRedirects(false);
+            urlConn.setReadTimeout(readTimeOut * 1000);
+            urlConn.addRequestProperty("User-Agent", userAgent);
+
+            if (log.isDebugEnabled()) {
+                logHeaderFields(urlConn);
+            }
+
+            final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
+            final int responseCode = urlConn.getResponseCode();
+
+            if (retryAfter > 0 && responseCode == HttpURLConnection.HTTP_UNAVAILABLE) {
+                log.warn("waiting and repeating request after " + retryAfter + " sec.");
+                errorList.add("503 Service Unavailable");
+                urlConn.disconnect();
+                sleepSeconds(retryAfter);
+                return attemptDownload(requestUrl, retryNumber + 1, errorList);
+            }
+
+            if (responseCode == HttpURLConnection.HTTP_MOVED_PERM || responseCode == HttpURLConnection.HTTP_MOVED_TEMP) {
+                // the location may be relative: resolve it against the url just requested
+                final String newUrl = new URL(new URL(requestUrl), obtainNewLocation(urlConn.getHeaderFields())).toString();
+                log.debug("The requested url has been moved to " + newUrl);
+                errorList.add(String.format("%s %s. Moved to: %s", responseCode, urlConn.getResponseMessage(), newUrl));
+                urlConn.disconnect();
+                return attemptDownload(newUrl, retryNumber + 1, errorList);
+            }
+
+            if (responseCode != HttpURLConnection.HTTP_OK) {
+                final String httpError = String.format("%s %s", responseCode, urlConn.getResponseMessage());
+                log.error("HTTP error: " + httpError);
+                errorList.add(httpError);
+                urlConn.disconnect();
+                sleepSeconds(defaultDelay);
+                return attemptDownload(requestUrl, retryNumber + 1, errorList);
+            }
+
+            final InputStream input = urlConn.getInputStream();
+            responseType = urlConn.getContentType();
+            return input;
+        } catch (final IOException e) {
+            log.error("error while retrieving from http-connection occured: " + requestUrl, e);
+            errorList.add(e.getMessage());
+            if (urlConn != null) {
+                urlConn.disconnect();
+            }
+            sleepSeconds(defaultDelay);
+            return attemptDownload(requestUrl, retryNumber + 1, errorList);
+        }
+    }
+
+    /**
+     * Waits for the given number of seconds.
+     *
+     * @throws DnetCollectorException
+     *             if the thread is interrupted; the interrupt flag is restored first
+     */
+    private void sleepSeconds(final long seconds) throws DnetCollectorException {
+        try {
+            Thread.sleep(seconds * 1000L);
+        } catch (final InterruptedException e) {
+            Thread.currentThread().interrupt();
+            throw new DnetCollectorException(e);
+        }
+    }
+
+    private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
+        log.debug("StatusCode: " + urlConn.getResponseMessage());
+
+        for (final Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
+            if (e.getKey() != null) {
+                for (final String v : e.getValue()) {
+                    log.debug(" key: " + e.getKey() + " - value: " + v);
+                }
+            }
+        }
+    }
+
+    /**
+     * @return the seconds given by the Retry-After header plus 10, or -1 if the header is missing
+     *         or is not a number of seconds
+     */
+    private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
+        for (final Map.Entry<String, List<String>> header : headerMap.entrySet()) {
+            if ("retry-after".equalsIgnoreCase(header.getKey()) && !header.getValue().isEmpty()) {
+                try {
+                    return Integer.parseInt(header.getValue().get(0)) + 10;
+                } catch (final NumberFormatException ignored) {
+                    // not an int (e.g. a date): the caller falls back to the default delay
+                }
+            }
+        }
+        return -1;
+    }
+
+    /**
+     * @return the first value of the Location header
+     * @throws DnetCollectorException
+     *             if the header is missing
+     */
+    private String obtainNewLocation(final Map<String, List<String>> headerMap) throws DnetCollectorException {
+        for (final Map.Entry<String, List<String>> header : headerMap.entrySet()) {
+            if ("location".equalsIgnoreCase(header.getKey()) && !header.getValue().isEmpty()) { return header.getValue().get(0); }
+        }
+        throw new DnetCollectorException("The requested url has been MOVED, but 'location' param is MISSING");
+    }
+
+    /**
+     * Installs, for every HttpsURLConnection of the JVM, a socket factory whose trust manager
+     * accepts any certificate. It is a workaround that switches server authentication off: do not
+     * call it where the identity of the remote endpoint matters.
+     */
+    public void initTrustManager() {
+        final X509TrustManager tm = new X509TrustManager() {
+
+            @Override
+            public void checkClientTrusted(final X509Certificate[] xcs, final String string) {}
+
+            @Override
+            public void checkServerTrusted(final X509Certificate[] xcs, final String string) {}
+
+            @Override
+            public X509Certificate[] getAcceptedIssuers() {
+                // the interface requires a non-null (possibly empty) array
+                return new X509Certificate[0];
+            }
+        };
+        try {
+            final SSLContext ctx = SSLContext.getInstance("TLS");
+            ctx.init(null, new TrustManager[] { tm }, null);
+            HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory());
+        } catch (final GeneralSecurityException e) {
+            log.fatal(e);
+            throw new IllegalStateException(e);
+        }
+    }
+
+    public int getMaxNumberOfRetry() {
+        return maxNumberOfRetry;
+    }
+
+    public void setMaxNumberOfRetry(final int maxNumberOfRetry) {
+        this.maxNumberOfRetry = maxNumberOfRetry;
+    }
+
+    public int getDefaultDelay() {
+        return defaultDelay;
+    }
+
+    public void setDefaultDelay(final int defaultDelay) {
+        this.defaultDelay = defaultDelay;
+    }
+
+    public int getReadTimeOut() {
+        return readTimeOut;
+    }
+
+    public void setReadTimeOut(final int readTimeOut) {
+        this.readTimeOut = readTimeOut;
+    }
+
+    public String getResponseType() {
+        return responseType;
+    }
+
+}
diff --git a/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java
new file mode 100644
index 000000000..7d1121a6d
--- /dev/null
+++ b/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java
@@ -0,0 +1,259 @@
+package eu.dnetlib.collector.worker.utils;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import 
java.util.regex.Pattern; + +/** + * @author jochen, Andreas Czerniak + * + */ +public class XmlCleaner { + + /** + * Pattern for numeric entities. + */ + private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); //$NON-NLS-1$ + // private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); //$NON-NLS-1$ + + // see https://www.w3.org/TR/REC-xml/#charsets , not only limited to + private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];"); + + /** + * Pattern that negates the allowable XML 4 byte unicode characters. Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | + * [#x10000-#x10FFFF] + */ + private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); //$NON-NLS-1$ + + // Map entities to their unicode equivalent + private static Set goodEntities = new HashSet<>(); + private static Map badEntities = new HashMap<>(); + + static { + // pre-defined XML entities + goodEntities.add("""); //$NON-NLS-1$ // quotation mark + goodEntities.add("&"); //$NON-NLS-1$ // ampersand + goodEntities.add("<"); //$NON-NLS-1$ // less-than sign + goodEntities.add(">"); //$NON-NLS-1$ // greater-than sign + // control entities + // badEntities.put(" ", ""); + badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("€", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‚", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ƒ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("„", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("…", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("†", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‡", " "); //$NON-NLS-1$ 
//$NON-NLS-2$ // illegal HTML character + badEntities.put("ˆ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‰", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Š", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‹", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Œ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Ž", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‘", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("’", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("“", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("”", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("•", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("–", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("—", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("˜", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("™", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("š", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("›", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("œ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ž", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Ÿ", " "); //$NON-NLS-1$ 
//$NON-NLS-2$ // illegal HTML character + // misc entities + badEntities.put("€", "\u20AC"); //$NON-NLS-1$ //$NON-NLS-2$ // euro + badEntities.put("‘", "\u2018"); //$NON-NLS-1$ //$NON-NLS-2$ // left single quotation mark + badEntities.put("’", "\u2019"); //$NON-NLS-1$ //$NON-NLS-2$ // right single quotation mark + // Latin 1 entities + badEntities.put(" ", "\u00A0"); //$NON-NLS-1$ //$NON-NLS-2$ // no-break space + badEntities.put("¡", "\u00A1"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation mark + badEntities.put("¢", "\u00A2"); //$NON-NLS-1$ //$NON-NLS-2$ // cent sign + badEntities.put("£", "\u00A3"); //$NON-NLS-1$ //$NON-NLS-2$ // pound sign + badEntities.put("¤", "\u00A4"); //$NON-NLS-1$ //$NON-NLS-2$ // currency sign + badEntities.put("¥", "\u00A5"); //$NON-NLS-1$ //$NON-NLS-2$ // yen sign + badEntities.put("¦", "\u00A6"); //$NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar + badEntities.put("§", "\u00A7"); //$NON-NLS-1$ //$NON-NLS-2$ // section sign + badEntities.put("¨", "\u00A8"); //$NON-NLS-1$ //$NON-NLS-2$ // diaeresis + badEntities.put("©", "\u00A9"); //$NON-NLS-1$ //$NON-NLS-2$ // copyright sign + badEntities.put("ª", "\u00AA"); //$NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal indicator + badEntities.put("«", "\u00AB"); //$NON-NLS-1$ //$NON-NLS-2$ // left-pointing double angle quotation mark + badEntities.put("¬", "\u00AC"); //$NON-NLS-1$ //$NON-NLS-2$ // not sign + badEntities.put("­", "\u00AD"); //$NON-NLS-1$ //$NON-NLS-2$ // soft hyphen + badEntities.put("®", "\u00AE"); //$NON-NLS-1$ //$NON-NLS-2$ // registered sign + badEntities.put("¯", "\u00AF"); //$NON-NLS-1$ //$NON-NLS-2$ // macron + badEntities.put("°", "\u00B0"); //$NON-NLS-1$ //$NON-NLS-2$ // degree sign + badEntities.put("±", "\u00B1"); //$NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign + badEntities.put("²", "\u00B2"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript two + badEntities.put("³", "\u00B3"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript three + badEntities.put("´", "\u00B4"); 
//$NON-NLS-1$ //$NON-NLS-2$ // acute accent + badEntities.put("µ", "\u00B5"); //$NON-NLS-1$ //$NON-NLS-2$ // micro sign + badEntities.put("¶", "\u00B6"); //$NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign + badEntities.put("·", "\u00B7"); //$NON-NLS-1$ //$NON-NLS-2$ // middle dot + badEntities.put("¸", "\u00B8"); //$NON-NLS-1$ //$NON-NLS-2$ // cedilla + badEntities.put("¹", "\u00B9"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript one + badEntities.put("º", "\u00BA"); //$NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal indicator + badEntities.put("»", "\u00BB"); //$NON-NLS-1$ //$NON-NLS-2$ // right-pointing double angle quotation mark + badEntities.put("¼", "\u00BC"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one quarter + badEntities.put("½", "\u00BD"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one half + badEntities.put("¾", "\u00BE"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three quarters + badEntities.put("¿", "\u00BF"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted question mark + badEntities.put("À", "\u00C0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with grave + badEntities.put("Á", "\u00C1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with acute + badEntities.put("Â", "\u00C2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with circumflex + badEntities.put("Ã", "\u00C3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with tilde + badEntities.put("Ä", "\u00C4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with diaeresis + badEntities.put("Å", "\u00C5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with ring above + badEntities.put("Æ", "\u00C6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter AE + badEntities.put("Ç", "\u00C7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter C with cedilla + badEntities.put("È", "\u00C8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with grave + badEntities.put("É", "\u00C9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with acute + 
badEntities.put("Ê", "\u00CA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with circumflex + badEntities.put("Ë", "\u00CB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with diaeresis + badEntities.put("Ì", "\u00CC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with grave + badEntities.put("Í", "\u00CD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with acute + badEntities.put("Î", "\u00CE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with circumflex + badEntities.put("Ï", "\u00CF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with diaeresis + badEntities.put("Ð", "\u00D0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH + badEntities.put("Ñ", "\u00D1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter N with tilde + badEntities.put("Ò", "\u00D2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with grave + badEntities.put("Ó", "\u00D3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with acute + badEntities.put("Ô", "\u00D4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with circumflex + badEntities.put("Õ", "\u00D5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with tilde + badEntities.put("Ö", "\u00D6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with diaeresis + badEntities.put("×", "\u00D7"); //$NON-NLS-1$ //$NON-NLS-2$ // multiplication sign + badEntities.put("Ø", "\u00D8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with stroke + badEntities.put("Ù", "\u00D9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with grave + badEntities.put("Ú", "\u00DA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with acute + badEntities.put("Û", "\u00DB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with circumflex + badEntities.put("Ü", "\u00DC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with diaeresis + badEntities.put("Ý", "\u00DD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter Y with acute + 
badEntities.put("Þ", "\u00DE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter THORN + badEntities.put("ß", "\u00DF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter sharp s + badEntities.put("à", "\u00E0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with grave + badEntities.put("á", "\u00E1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with acute + badEntities.put("â", "\u00E2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with circumflex + badEntities.put("ã", "\u00E3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with tilde + badEntities.put("ä", "\u00E4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with diaeresis + badEntities.put("å", "\u00E5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with ring above + badEntities.put("æ", "\u00E6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae + badEntities.put("ç", "\u00E7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter c with cedilla + badEntities.put("è", "\u00E8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with grave + badEntities.put("é", "\u00E9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with acute + badEntities.put("ê", "\u00EA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with circumflex + badEntities.put("ë", "\u00EB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with diaeresis + badEntities.put("ì", "\u00EC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with grave + badEntities.put("í", "\u00ED"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with acute + badEntities.put("î", "\u00EE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with circumflex + badEntities.put("ï", "\u00EF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with diaeresis + badEntities.put("ð", "\u00F0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth + badEntities.put("ñ", "\u00F1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter n with tilde + badEntities.put("ò", "\u00F2"); //$NON-NLS-1$ 
//$NON-NLS-2$ // latin small letter o with grave + badEntities.put("ó", "\u00F3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with acute + badEntities.put("ô", "\u00F4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with circumflex + badEntities.put("õ", "\u00F5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with tilde + badEntities.put("ö", "\u00F6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with diaeresis + badEntities.put("÷", "\u00F7"); //$NON-NLS-1$ //$NON-NLS-2$ // division sign + badEntities.put("ø", "\u00F8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with stroke + badEntities.put("ù", "\u00F9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with grave + badEntities.put("ú", "\u00FA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with acute + badEntities.put("û", "\u00FB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with circumflex + badEntities.put("ü", "\u00FC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with diaeresis + badEntities.put("ý", "\u00FD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with acute + badEntities.put("þ", "\u00FE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter thorn + badEntities.put("ÿ", "\u00FF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with diaeresis + } + + /** + * For each entity in the input that is not allowed in XML, replace the entity with its unicode equivalent or remove it. For each + * instance of a bare {@literal &}, replace it with {@literal &
+ * } XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal &lt;} and {@literal &gt;}. + * + * @param broken + * the string to handle entities + * @return the string with entities appropriately fixed up + */ + static public String cleanAllEntities(final String broken) { + if (broken == null) { return null; } + + String working = invalidControlCharPattern.matcher(broken).replaceAll(""); + working = invalidCharacterPattern.matcher(working).replaceAll(""); + + int cleanfrom = 0; + + while (true) { + int amp = working.indexOf('&', cleanfrom); + // If there are no more amps then we are done + if (amp == -1) { + break; + } + // Skip references of the kind &#ddd; + if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) { + cleanfrom = working.indexOf(';', amp) + 1; + continue; + } + int i = amp + 1; + while (true) { + // if we are at the end of the string then just escape the '&'; + if (i >= working.length()) { return working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$ + } + // if we have come to a ; then we have an entity + // If it is something that xml can't handle then replace it. 
+ final char c = working.charAt(i); + if (c == ';') { + final String entity = working.substring(amp, i + 1); + final String replace = handleEntity(entity); + working = working.substring(0, amp) + replace + working.substring(i + 1); + break; + } + // Did we end an entity without finding a closing ; + // Then treat it as an '&' that needs to be replaced with & + if (!Character.isLetterOrDigit(c)) { + working = working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$ + amp = i + 4; // account for the 4 extra characters + break; + } + i++; + } + cleanfrom = amp + 1; + } + + if (Pattern.compile("<<").matcher(working).find()) { + working = working.replaceAll("<<", "<<"); + } + + if (Pattern.compile(">>").matcher(working).find()) { + working = working.replaceAll(">>", ">>"); + } + + return working; + } + + /** + * Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip it out. XML only allows 4 entities: + * &amp;, &quot;, &lt; and &gt;. + * + * @param entity + * the entity to be replaced + * @return the substitution for the entity, either itself, the unicode equivalent or an empty string. + */ + private static String handleEntity(final String entity) { + if (goodEntities.contains(entity)) { return entity; } + + final String replace = badEntities.get(entity); + if (replace != null) { return replace; } + + return replace != null ? 
replace : ""; + } +} diff --git a/dhp-collector-worker/src/main/resources/application.properties b/dhp-collector-worker/src/main/resources/application.properties new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/dhp-collector-worker/src/main/resources/application.properties @@ -0,0 +1 @@ + diff --git a/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java new file mode 100644 index 000000000..24f78f318 --- /dev/null +++ b/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java @@ -0,0 +1,45 @@ +package eu.dnetlib.collector.worker; + +import static org.junit.Assert.assertNotNull; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.CommandLineRunner; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.context.ApplicationContext; +import org.springframework.test.context.junit4.SpringRunner; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.collector.worker.model.ApiDescriptor; +import eu.dnetlib.collector.worker.utils.CollectorPluginEnumerator; + +@RunWith(SpringRunner.class) +@SpringBootTest +public class DnetCollectorWorkerApplicationTests { + + @Autowired + private ApplicationContext ctx; + + @Test + public void testFindPlugin() throws Exception { + final CollectorPluginEnumerator collectorPluginEnumerator = ctx.getBean(CollectorPluginEnumerator.class); + assertNotNull(collectorPluginEnumerator.getPluginByProtocol("oai")); + assertNotNull(collectorPluginEnumerator.getPluginByProtocol("OAI")); + } + + @Test + public void testCollectionOAI() throws Exception { + final ApiDescriptor api = new ApiDescriptor(); + api.setId("oai"); + api.setProtocol("oai"); + 
api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); + api.getParams().put("format", "oai_dc"); + + ObjectMapper mapper = new ObjectMapper(); + + System.out.println(mapper.writeValueAsString(api)); + } + +}