From c2ecbf557259d709be22888a17f0fe77c904a4bb Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 3 Apr 2019 16:03:36 +0200 Subject: [PATCH] moved collector worker --- .../dhp/collection/TransformSparkJobNode.java | 4 + .../dhp/transformation/TransformFunction.java | 13 + dhp-workflows/dhp-collector-worker/README.md | 16 -- dhp-workflows/dhp-collector-worker/pom.xml | 92 ------- .../worker/DnetCollectorException.java | 30 -- .../DnetCollectorWorkerApplication.java | 121 -------- .../collector/worker/model/ApiDescriptor.java | 50 ---- .../worker/plugins/CollectorPlugin.java | 11 - .../plugins/oai/OaiCollectorPlugin.java | 66 ----- .../worker/plugins/oai/OaiIterator.java | 163 ----------- .../plugins/oai/OaiIteratorFactory.java | 20 -- .../utils/CollectorPluginEnumerator.java | 24 -- .../utils/CollectorPluginErrorLogList.java | 19 -- .../worker/utils/DnetWorkerCollector.java | 14 - .../collector/worker/utils/HttpConnector.java | 226 --------------- .../collector/worker/utils/XmlCleaner.java | 259 ------------------ .../src/main/resources/application.properties | 1 - .../DnetCollectorWorkerApplicationTests.java | 45 --- .../target/classes/application.properties | 1 - 19 files changed, 17 insertions(+), 1158 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/TransformSparkJobNode.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java delete mode 100644 dhp-workflows/dhp-collector-worker/README.md delete mode 100644 dhp-workflows/dhp-collector-worker/pom.xml delete mode 100644 dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java delete mode 100644 dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java delete mode 100644 dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java delete mode 100644 dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java delete mode 100644 dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java delete mode 100644 dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java delete mode 100644 dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIteratorFactory.java delete mode 100644 dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java delete mode 100644 dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java delete mode 100644 dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java delete mode 100644 dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java delete mode 100644 dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java delete mode 100644 dhp-workflows/dhp-collector-worker/src/main/resources/application.properties delete mode 100644 dhp-workflows/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java delete mode 100644 dhp-workflows/dhp-collector-worker/target/classes/application.properties diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/TransformSparkJobNode.java new file mode 100644 index 000000000..2e1d53aa4 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/TransformSparkJobNode.java @@ -0,0 +1,4 @@ +package eu.dnetlib.dhp.collection; + +public class TransformSparkJobNode { +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java new file mode 100644 index 000000000..393db48ba --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java @@ -0,0 +1,13 @@ +package eu.dnetlib.dhp.collection; + +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import org.apache.spark.api.java.function.MapFunction; + +public class TransformFunction implements MapFunction { + + + @Override + public MetadataRecord call(MetadataRecord value) throws Exception { + return null; + } +} diff --git a/dhp-workflows/dhp-collector-worker/README.md b/dhp-workflows/dhp-collector-worker/README.md deleted file mode 100644 index b6609179d..000000000 --- a/dhp-workflows/dhp-collector-worker/README.md +++ /dev/null @@ -1,16 +0,0 @@ -Description of the Module --------------------------- -This module defines a **collector worker application** that run on Hadoop. - -It is responsible of harvesting metadata using different plugin, that should be passed as arguments -in the main class - -##Plugins -* OAI Plugin - - -## Api Descriptor -TODO - -## Usage -TODO \ No newline at end of file diff --git a/dhp-workflows/dhp-collector-worker/pom.xml b/dhp-workflows/dhp-collector-worker/pom.xml deleted file mode 100644 index 27fae6961..000000000 --- a/dhp-workflows/dhp-collector-worker/pom.xml +++ /dev/null @@ -1,92 +0,0 @@ - - - - org.springframework.boot - spring-boot-starter-parent - 2.1.3.RELEASE - - 4.0.0 - - eu.dnetlib - dhp-collector-worker - 1.0.0 - - - - - cloudera - Cloudera Repository - https://repository.cloudera.com/artifactory/cloudera-repos - - true - - - false - - - - - - - org.springframework.boot - spring-boot-starter - - - org.apache.hadoop - hadoop-client - 2.6.0-cdh5.9.2 - - - com.fasterxml.jackson.core - jackson-core - - - com.fasterxml.jackson.core - jackson-annotations - - - com.fasterxml.jackson.core - jackson-databind - - - dom4j - dom4j - - - jaxen - jaxen - - - org.springframework.boot - spring-boot-starter-test - test - - - - - - - - org.springframework.boot - spring-boot-maven-plugin - - true - - - - - - - - UTF-8 - UTF-8 - cdh5.9.2 - 2.6.0-${dhp.cdh.version} - 2.2.0 - 2.11.8 - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java b/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java deleted file mode 100644 index bc4287a0d..000000000 --- a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java +++ /dev/null @@ -1,30 +0,0 @@ -package eu.dnetlib.collector.worker; - -public class DnetCollectorException extends Exception { - - /** - * - */ - private static final long serialVersionUID = -290723075076039757L; - - public DnetCollectorException() { - super(); - } - - public DnetCollectorException(final String message, final Throwable cause, final boolean enableSuppression, final boolean writableStackTrace) { - super(message, cause, enableSuppression, writableStackTrace); - } - - public DnetCollectorException(final String message, final Throwable cause) { - super(message, cause); - } - - public DnetCollectorException(final String message) { - super(message); - } - - public DnetCollectorException(final Throwable cause) { - super(cause); - } - -} diff --git a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java b/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java deleted file mode 100644 index c2854b8f2..000000000 --- a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java +++ /dev/null @@ -1,121 +0,0 @@ -package eu.dnetlib.collector.worker; - -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.collector.worker.plugins.CollectorPlugin; -import eu.dnetlib.collector.worker.utils.CollectorPluginEnumerator; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.CommandLineRunner; -import org.springframework.boot.SpringApplication; -import org.springframework.boot.autoconfigure.SpringBootApplication; - -import java.io.IOException; -import java.net.URI; -import java.util.concurrent.atomic.AtomicInteger; - - -/** - * - * DnetCollectortWorkerApplication is the main class responsible to start - * the Dnet Collection into HDFS. - * This module will be executed on the hadoop cluster and taking in input some parameters - * that tells it which is the right collector plugin to use and where store the data into HDFS path - * - * - * @author Sandro La Bruzzo - */ -@SpringBootApplication -public class DnetCollectorWorkerApplication implements CommandLineRunner { - - private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class); - - @Autowired - private CollectorPluginEnumerator collectorPluginEnumerator; - - /** - * - * @param args - */ - public static void main(final String[] args) { - SpringApplication.run(DnetCollectorWorkerApplication.class, args); - } - - /** - * This module expect two arguments: - * path hdfs where store the sequential file. - * Json serialization of {@link ApiDescriptor} - */ - @Override - public void run(final String... args) throws Exception { - if (args.length == 0) { return; } - if (args.length != 3) { throw new DnetCollectorException("Invalid number of parameters, expected: hdfs_path and json_api_description"); } - - //TODO : migrate to https://commons.apache.org/proper/commons-cli/usage.html - - - - - final String hdfsPath = args[0]; - - log.info("hdfsPath ="+hdfsPath); - - final String json = args[1]; - - - final String nameNode = args[2]; - - log.info("json = "+json); - final ObjectMapper jsonMapper = new ObjectMapper(); - final ApiDescriptor api = jsonMapper.readValue(json, ApiDescriptor.class); - - final CollectorPlugin plugin = collectorPluginEnumerator.getPluginByProtocol(api.getProtocol()); - - final String hdfsuri =nameNode; - - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - - System.setProperty("HADOOP_USER_NAME", "sandro.labruzzo"); - System.setProperty("hadoop.home.dir", "/"); - //Get the filesystem - HDFS - FileSystem fs = FileSystem.get(URI.create(hdfsuri), conf); - Path hdfswritepath = new Path(hdfsPath); - - log.info("Created path "+hdfswritepath.toString()); - - try(SequenceFile.Writer writer = SequenceFile.createWriter(conf, - SequenceFile.Writer.file(hdfswritepath), SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { - - final AtomicInteger counter = new AtomicInteger(0); - final IntWritable key = new IntWritable(counter.get()); - final Text value = new Text(); - - plugin.collect(api).forEach(content -> { - - key.set(counter.getAndIncrement()); - value.set(content); - try { - writer.append(key, value); - } catch (IOException e) { - throw new RuntimeException(e); - } - - }); - } - } - -} diff --git a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java b/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java deleted file mode 100644 index 6874d97f4..000000000 --- a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java +++ /dev/null @@ -1,50 +0,0 @@ -package eu.dnetlib.collector.worker.model; - -import java.util.HashMap; -import java.util.Map; - - -//TODO sholud be moved on dhp-common -public class ApiDescriptor { - - private String id; - - private String baseUrl; - - private String protocol; - - private Map params = new HashMap<>(); - - public String getBaseUrl() { - return baseUrl; - } - - public void setBaseUrl(final String baseUrl) { - this.baseUrl = baseUrl; - } - - public String getId() { - return id; - } - - public void setId(final String id) { - this.id = id; - } - - public Map getParams() { - return params; - } - - public void setParams(final HashMap params) { - this.params = params; - } - - public String getProtocol() { - return protocol; - } - - public void setProtocol(final String protocol) { - this.protocol = protocol; - } - -} diff --git a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java b/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java deleted file mode 100644 index 5ec1e9a6e..000000000 --- a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java +++ /dev/null @@ -1,11 +0,0 @@ -package eu.dnetlib.collector.worker.plugins; - -import java.util.stream.Stream; - -import eu.dnetlib.collector.worker.DnetCollectorException; -import eu.dnetlib.collector.worker.model.ApiDescriptor; - -public interface CollectorPlugin { - - Stream collect(ApiDescriptor api) throws DnetCollectorException; -} diff --git a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java b/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java deleted file mode 100644 index 7c4bec192..000000000 --- a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java +++ /dev/null @@ -1,66 +0,0 @@ -package eu.dnetlib.collector.worker.plugins.oai; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Spliterator; -import java.util.Spliterators; -import java.util.stream.Stream; -import java.util.stream.StreamSupport; - -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; - -import com.google.common.base.Splitter; -import com.google.common.collect.Iterators; -import com.google.common.collect.Lists; - -import eu.dnetlib.collector.worker.DnetCollectorException; -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.collector.worker.plugins.CollectorPlugin; -import eu.dnetlib.collector.worker.utils.DnetWorkerCollector; - -@Component -@DnetWorkerCollector("oai") -public class OaiCollectorPlugin implements CollectorPlugin { - - private static final String FORMAT_PARAM = "format"; - private static final String OAI_SET_PARAM = "set"; - private static final Object OAI_FROM_DATE_PARAM = "fromDate"; - private static final Object OAI_UNTIL_DATE_PARAM = "untilDate"; - - @Autowired - private OaiIteratorFactory oaiIteratorFactory; - - @Override - public Stream collect(final ApiDescriptor api) throws DnetCollectorException { - final String baseUrl = api.getBaseUrl(); - final String mdFormat = api.getParams().get(FORMAT_PARAM); - final String setParam = api.getParams().get(OAI_SET_PARAM); - final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM); - final String untilDate = api.getParams().get(OAI_UNTIL_DATE_PARAM); - - final List sets = new ArrayList<>(); - if (setParam != null) { - sets.addAll(Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam))); - } - if (sets.isEmpty()) { - // If no set is defined, ALL the sets must be harvested - sets.add(""); - } - - if (baseUrl == null || baseUrl.isEmpty()) { throw new DnetCollectorException("Param 'baseurl' is null or empty"); } - - if (mdFormat == null || mdFormat.isEmpty()) { throw new DnetCollectorException("Param 'mdFormat' is null or empty"); } - - if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); } - - if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); } - - final Iterator> iters = sets.stream() - .map(set -> oaiIteratorFactory.newIterator(baseUrl, mdFormat, set, fromDate, untilDate)) - .iterator(); - - return StreamSupport.stream(Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), false); - } -} diff --git a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java b/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java deleted file mode 100644 index 191b7b596..000000000 --- a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java +++ /dev/null @@ -1,163 +0,0 @@ -package eu.dnetlib.collector.worker.plugins.oai; - -import java.io.StringReader; -import java.io.UnsupportedEncodingException; -import java.net.URLEncoder; -import java.util.Iterator; -import java.util.Queue; -import java.util.concurrent.PriorityBlockingQueue; - -import org.apache.commons.lang.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.dom4j.Document; -import org.dom4j.DocumentException; -import org.dom4j.Node; -import org.dom4j.io.SAXReader; - -import eu.dnetlib.collector.worker.DnetCollectorException; -import eu.dnetlib.collector.worker.utils.HttpConnector; -import eu.dnetlib.collector.worker.utils.XmlCleaner; - -public class OaiIterator implements Iterator { - - private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM - - private final Queue queue = new PriorityBlockingQueue<>(); - private final SAXReader reader = new SAXReader(); - - private final String baseUrl; - private final String set; - private final String mdFormat; - private final String fromDate; - private final String untilDate; - private String token; - private boolean started; - private final HttpConnector httpConnector; - - public OaiIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate, - final HttpConnector httpConnector) { - this.baseUrl = baseUrl; - this.mdFormat = mdFormat; - this.set = set; - this.fromDate = fromDate; - this.untilDate = untilDate; - this.started = false; - this.httpConnector = httpConnector; - } - - private void verifyStarted() { - if (!this.started) { - this.started = true; - try { - this.token = firstPage(); - } catch (final DnetCollectorException e) { - throw new RuntimeException(e); - } - } - } - - @Override - public boolean hasNext() { - synchronized (queue) { - verifyStarted(); - return !queue.isEmpty(); - } - } - - @Override - public String next() { - synchronized (queue) { - verifyStarted(); - final String res = queue.poll(); - while (queue.isEmpty() && token != null && !token.isEmpty()) { - try { - token = otherPages(token); - } catch (final DnetCollectorException e) { - throw new RuntimeException(e); - } - } - return res; - } - } - - @Override - public void remove() {} - - private String firstPage() throws DnetCollectorException { - try { - String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8"); - if (set != null && !set.isEmpty()) { - url += "&set=" + URLEncoder.encode(set, "UTF-8"); - } - if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - url += "&from=" + URLEncoder.encode(fromDate, "UTF-8"); - } - if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - url += "&until=" + URLEncoder.encode(untilDate, "UTF-8"); - } - log.info("Start harvesting using url: " + url); - - return downloadPage(url); - } catch (final UnsupportedEncodingException e) { - throw new DnetCollectorException(e); - } - } - - private String extractResumptionToken(final String xml) { - - final String s = StringUtils.substringAfter(xml, "", " newIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate) { - return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, httpConnector); - } - -} diff --git a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java b/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java deleted file mode 100644 index 3f5b245ef..000000000 --- a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java +++ /dev/null @@ -1,24 +0,0 @@ -package eu.dnetlib.collector.worker.utils; - -import java.util.List; - -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; - -import eu.dnetlib.collector.worker.plugins.CollectorPlugin; - -@Component -public class CollectorPluginEnumerator { - - @Autowired - private List plugins; - - public CollectorPlugin getPluginByProtocol(final String protocol) { - return plugins.stream() - .filter(p -> p.getClass().isAnnotationPresent(DnetWorkerCollector.class)) - .filter(p -> p.getClass().getAnnotation(DnetWorkerCollector.class).value().equalsIgnoreCase(protocol)) - .findFirst() - .get(); - } - -} diff --git a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java b/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java deleted file mode 100644 index 062f6c7a8..000000000 --- a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java +++ /dev/null @@ -1,19 +0,0 @@ -package eu.dnetlib.collector.worker.utils; - -import java.util.LinkedList; - -public class CollectorPluginErrorLogList extends LinkedList { - - private static final long serialVersionUID = -6925786561303289704L; - - @Override - public String toString() { - String log = new String(); - int index = 0; - for (final String errorMessage : this) { - log += String.format("Retry #%s: %s / ", index++, errorMessage); - } - return log; - } - -} diff --git a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java b/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java deleted file mode 100644 index 28891c84f..000000000 --- a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java +++ /dev/null @@ -1,14 +0,0 @@ -package eu.dnetlib.collector.worker.utils; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; - -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.TYPE) -public @interface DnetWorkerCollector { - - String value(); - -} diff --git a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java b/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java deleted file mode 100644 index 8a24381e5..000000000 --- a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java +++ /dev/null @@ -1,226 +0,0 @@ -package eu.dnetlib.collector.worker.utils; - -import java.io.IOException; -import java.io.InputStream; -import java.net.CookieHandler; -import java.net.CookieManager; -import java.net.CookiePolicy; -import java.net.HttpURLConnection; -import java.net.URL; -import java.security.GeneralSecurityException; -import java.security.cert.X509Certificate; -import java.util.List; -import java.util.Map; - -import javax.net.ssl.HttpsURLConnection; -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang.math.NumberUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.springframework.stereotype.Component; - -import eu.dnetlib.collector.worker.DnetCollectorException; - -@Component -public class HttpConnector { - - private static final Log log = LogFactory.getLog(HttpConnector.class); - - private int maxNumberOfRetry = 6; - private int defaultDelay = 120; // seconds - private int readTimeOut = 120; // seconds - - private String responseType = null; - - private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; - - public HttpConnector() { - CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); - } - - /** - * Given the URL returns the content via HTTP GET - * - * @param requestUrl - * the URL - * @return the content of the downloaded resource - * @throws CollectorServiceException - * when retrying more than maxNumberOfRetry times - */ - public String getInputSource(final String requestUrl) throws DnetCollectorException { - return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); - } - - /** - * Given the URL returns the content as a stream via HTTP GET - * - * @param requestUrl - * the URL - * @return the content of the downloaded resource as InputStream - * @throws CollectorServiceException - * when retrying more than maxNumberOfRetry times - */ - public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException { - return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - } - - private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws DnetCollectorException { - try { - final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - try { - return IOUtils.toString(s); - } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); - } finally { - IOUtils.closeQuietly(s); - } - } catch (final InterruptedException e) { - throw new DnetCollectorException(e); - } - } - - private InputStream attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws DnetCollectorException { - - if (retryNumber > maxNumberOfRetry) { throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList); } - - log.debug("Downloading " + requestUrl + " - try: " + retryNumber); - try { - InputStream input = null; - - try { - final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); - urlConn.setInstanceFollowRedirects(false); - urlConn.setReadTimeout(readTimeOut * 1000); - urlConn.addRequestProperty("User-Agent", userAgent); - - if (log.isDebugEnabled()) { - logHeaderFields(urlConn); - } - - final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); - if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { - log.warn("waiting and repeating request after " + retryAfter + " sec."); - Thread.sleep(retryAfter * 1000); - errorList.add("503 Service Unavailable"); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) { - final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); - log.debug("The requested url has been moved to " + newUrl); - errorList.add(String.format("%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl)); - urlConn.disconnect(); - return attemptDownload(newUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { - log.error(String.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - Thread.sleep(defaultDelay * 1000); - errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else { - input = urlConn.getInputStream(); - responseType = urlConn.getContentType(); - return input; - } - } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } - } catch (final InterruptedException e) { - throw new DnetCollectorException(e); - } - } - - private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { - log.debug("StatusCode: " + urlConn.getResponseMessage()); - - for (final Map.Entry> e : urlConn.getHeaderFields().entrySet()) { - if (e.getKey() != null) { - for (final String v : e.getValue()) { - log.debug(" key: " + e.getKey() + " - value: " + v); - } - } - } - } - - private int obtainRetryAfter(final Map> headerMap) { - for (final String key : headerMap.keySet()) { - if (key != null && key.toLowerCase().equals("retry-after") && headerMap.get(key).size() > 0 - && NumberUtils.isNumber(headerMap.get(key).get(0))) { return Integer.parseInt(headerMap.get(key).get(0)) + 10; } - } - return -1; - } - - private String obtainNewLocation(final Map> headerMap) throws DnetCollectorException { - for (final String key : headerMap.keySet()) { - if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) { return headerMap.get(key).get(0); } - } - throw new DnetCollectorException("The requested url has been MOVED, but 'location' param is MISSING"); - } - - /** - * register for https scheme; this is a workaround and not intended for the use in trusted environments - */ - public void initTrustManager() { - final X509TrustManager tm = new X509TrustManager() { - - @Override - public void checkClientTrusted(final X509Certificate[] xcs, final String string) {} - - @Override - public void checkServerTrusted(final X509Certificate[] xcs, final String string) {} - - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - }; - try { - final SSLContext ctx = SSLContext.getInstance("TLS"); - ctx.init(null, new TrustManager[] { tm }, null); - HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); - } catch (final GeneralSecurityException e) { - log.fatal(e); - throw new IllegalStateException(e); - } - } - - public int getMaxNumberOfRetry() { - return maxNumberOfRetry; - } - - public void setMaxNumberOfRetry(final int maxNumberOfRetry) { - this.maxNumberOfRetry = maxNumberOfRetry; - } - - public int getDefaultDelay() { - return defaultDelay; - } - - public void setDefaultDelay(final int defaultDelay) { - this.defaultDelay = defaultDelay; - } - - public int getReadTimeOut() { - return readTimeOut; - } - - public void setReadTimeOut(final int readTimeOut) { - this.readTimeOut = readTimeOut; - } - - public String getResponseType() { - return responseType; - } - -} diff --git a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java b/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java deleted file mode 100644 index 7d1121a6d..000000000 --- a/dhp-workflows/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java +++ /dev/null @@ -1,259 +0,0 @@ -package eu.dnetlib.collector.worker.utils; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; - -/** - * @author jochen, Andreas Czerniak - * - */ -public class XmlCleaner { - - /** - * Pattern for numeric entities. - */ - private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); //$NON-NLS-1$ - // private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); //$NON-NLS-1$ - - // see https://www.w3.org/TR/REC-xml/#charsets , not only limited to - private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];"); - - /** - * Pattern that negates the allowable XML 4 byte unicode characters. Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | - * [#x10000-#x10FFFF] - */ - private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); //$NON-NLS-1$ - - // Map entities to their unicode equivalent - private static Set goodEntities = new HashSet<>(); - private static Map badEntities = new HashMap<>(); - - static { - // pre-defined XML entities - goodEntities.add("""); //$NON-NLS-1$ // quotation mark - goodEntities.add("&"); //$NON-NLS-1$ // ampersand - goodEntities.add("<"); //$NON-NLS-1$ // less-than sign - goodEntities.add(">"); //$NON-NLS-1$ // greater-than sign - // control entities - // badEntities.put(" ", ""); - badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("€", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‚", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ƒ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("„", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("…", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("†", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‡", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ˆ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‰", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Š", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‹", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Œ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Ž", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‘", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("’", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("“", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("”", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("•", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("–", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("—", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("˜", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("™", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("š", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("›", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("œ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ž", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Ÿ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - // misc entities - badEntities.put("€", "\u20AC"); //$NON-NLS-1$ //$NON-NLS-2$ // euro - badEntities.put("‘", "\u2018"); //$NON-NLS-1$ //$NON-NLS-2$ // left single quotation mark - badEntities.put("’", "\u2019"); //$NON-NLS-1$ //$NON-NLS-2$ // right single quotation mark - // Latin 1 entities - badEntities.put(" ", "\u00A0"); //$NON-NLS-1$ //$NON-NLS-2$ // no-break space - badEntities.put("¡", "\u00A1"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation mark - badEntities.put("¢", "\u00A2"); //$NON-NLS-1$ //$NON-NLS-2$ // cent sign - badEntities.put("£", "\u00A3"); //$NON-NLS-1$ //$NON-NLS-2$ // pound sign - badEntities.put("¤", "\u00A4"); //$NON-NLS-1$ //$NON-NLS-2$ // currency sign - badEntities.put("¥", "\u00A5"); //$NON-NLS-1$ //$NON-NLS-2$ // yen sign - badEntities.put("¦", "\u00A6"); //$NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar - badEntities.put("§", "\u00A7"); //$NON-NLS-1$ //$NON-NLS-2$ // section sign - badEntities.put("¨", "\u00A8"); //$NON-NLS-1$ //$NON-NLS-2$ // diaeresis - badEntities.put("©", "\u00A9"); //$NON-NLS-1$ //$NON-NLS-2$ // copyright sign - badEntities.put("ª", "\u00AA"); //$NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal indicator - badEntities.put("«", "\u00AB"); //$NON-NLS-1$ //$NON-NLS-2$ // left-pointing double angle quotation mark - badEntities.put("¬", "\u00AC"); //$NON-NLS-1$ //$NON-NLS-2$ // not sign - badEntities.put("­", "\u00AD"); //$NON-NLS-1$ //$NON-NLS-2$ // soft hyphen - badEntities.put("®", "\u00AE"); //$NON-NLS-1$ //$NON-NLS-2$ // registered sign - badEntities.put("¯", "\u00AF"); //$NON-NLS-1$ //$NON-NLS-2$ // macron - badEntities.put("°", "\u00B0"); //$NON-NLS-1$ //$NON-NLS-2$ // degree sign - badEntities.put("±", "\u00B1"); //$NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign - badEntities.put("²", "\u00B2"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript two - badEntities.put("³", "\u00B3"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript three - badEntities.put("´", "\u00B4"); //$NON-NLS-1$ //$NON-NLS-2$ // acute accent - badEntities.put("µ", "\u00B5"); //$NON-NLS-1$ //$NON-NLS-2$ // micro sign - badEntities.put("¶", "\u00B6"); //$NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign - badEntities.put("·", "\u00B7"); //$NON-NLS-1$ //$NON-NLS-2$ // middle dot - badEntities.put("¸", "\u00B8"); //$NON-NLS-1$ //$NON-NLS-2$ // cedilla - badEntities.put("¹", "\u00B9"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript one - badEntities.put("º", "\u00BA"); //$NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal indicator - badEntities.put("»", "\u00BB"); //$NON-NLS-1$ //$NON-NLS-2$ // right-pointing double angle quotation mark - badEntities.put("¼", "\u00BC"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one quarter - badEntities.put("½", "\u00BD"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one half - badEntities.put("¾", "\u00BE"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three quarters - badEntities.put("¿", "\u00BF"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted question mark - badEntities.put("À", "\u00C0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with grave - badEntities.put("Á", "\u00C1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with acute - badEntities.put("Â", "\u00C2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with circumflex - badEntities.put("Ã", "\u00C3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with tilde - badEntities.put("Ä", "\u00C4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with diaeresis - badEntities.put("Å", "\u00C5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with ring above - badEntities.put("Æ", "\u00C6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter AE - badEntities.put("Ç", "\u00C7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter C with cedilla - badEntities.put("È", "\u00C8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with grave - badEntities.put("É", "\u00C9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with acute - badEntities.put("Ê", "\u00CA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with circumflex - badEntities.put("Ë", "\u00CB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with diaeresis - badEntities.put("Ì", "\u00CC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with grave - badEntities.put("Í", "\u00CD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with acute - badEntities.put("Î", "\u00CE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with circumflex - badEntities.put("Ï", "\u00CF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with diaeresis - badEntities.put("Ð", "\u00D0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH - badEntities.put("Ñ", "\u00D1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter N with tilde - badEntities.put("Ò", "\u00D2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with grave - badEntities.put("Ó", "\u00D3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with acute - badEntities.put("Ô", "\u00D4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with circumflex - badEntities.put("Õ", "\u00D5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with tilde - badEntities.put("Ö", "\u00D6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with diaeresis - badEntities.put("×", "\u00D7"); //$NON-NLS-1$ //$NON-NLS-2$ // multiplication sign - badEntities.put("Ø", "\u00D8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with stroke - badEntities.put("Ù", "\u00D9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with grave - badEntities.put("Ú", "\u00DA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with acute - badEntities.put("Û", "\u00DB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with circumflex - badEntities.put("Ü", "\u00DC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with diaeresis - badEntities.put("Ý", "\u00DD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter Y with acute - badEntities.put("Þ", "\u00DE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter THORN - badEntities.put("ß", "\u00DF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter sharp s - badEntities.put("à", "\u00E0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with grave - badEntities.put("á", "\u00E1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with acute - badEntities.put("â", "\u00E2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with circumflex - badEntities.put("ã", "\u00E3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with tilde - badEntities.put("ä", "\u00E4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with diaeresis - badEntities.put("å", "\u00E5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with ring above - badEntities.put("æ", "\u00E6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae - badEntities.put("ç", "\u00E7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter c with cedilla - badEntities.put("è", "\u00E8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with grave - badEntities.put("é", "\u00E9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with acute - badEntities.put("ê", "\u00EA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with circumflex - badEntities.put("ë", "\u00EB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with diaeresis - badEntities.put("ì", "\u00EC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with grave - badEntities.put("í", "\u00ED"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with acute - badEntities.put("î", "\u00EE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with circumflex - badEntities.put("ï", "\u00EF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with diaeresis - badEntities.put("ð", "\u00F0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth - badEntities.put("ñ", "\u00F1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter n with tilde - badEntities.put("ò", "\u00F2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with grave - badEntities.put("ó", "\u00F3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with acute - badEntities.put("ô", "\u00F4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with circumflex - badEntities.put("õ", "\u00F5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with tilde - badEntities.put("ö", "\u00F6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with diaeresis - badEntities.put("÷", "\u00F7"); //$NON-NLS-1$ //$NON-NLS-2$ // division sign - badEntities.put("ø", "\u00F8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with stroke - badEntities.put("ù", "\u00F9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with grave - badEntities.put("ú", "\u00FA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with acute - badEntities.put("û", "\u00FB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with circumflex - badEntities.put("ü", "\u00FC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with diaeresis - badEntities.put("ý", "\u00FD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with acute - badEntities.put("þ", "\u00FE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter thorn - badEntities.put("ÿ", "\u00FF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with diaeresis - } - - /** - * For each entity in the input that is not allowed in XML, replace the entity with its unicode equivalent or remove it. For each - * instance of a bare {@literal &}, replace it with {@literal &
- * } XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal &lt;} and {@literal &gt;}. - * - * @param broken - * the string to handle entities - * @return the string with entities appropriately fixed up - */ - static public String cleanAllEntities(final String broken) { - if (broken == null) { return null; } - - String working = invalidControlCharPattern.matcher(broken).replaceAll(""); - working = invalidCharacterPattern.matcher(working).replaceAll(""); - - int cleanfrom = 0; - - while (true) { - int amp = working.indexOf('&', cleanfrom); - // If there are no more amps then we are done - if (amp == -1) { - break; - } - // Skip references of the kind &#ddd; - if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) { - cleanfrom = working.indexOf(';', amp) + 1; - continue; - } - int i = amp + 1; - while (true) { - // if we are at the end of the string then just escape the '&'; - if (i >= working.length()) { return working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$ - } - // if we have come to a ; then we have an entity - // If it is something that xml can't handle then replace it. - final char c = working.charAt(i); - if (c == ';') { - final String entity = working.substring(amp, i + 1); - final String replace = handleEntity(entity); - working = working.substring(0, amp) + replace + working.substring(i + 1); - break; - } - // Did we end an entity without finding a closing ; - // Then treat it as an '&' that needs to be replaced with & - if (!Character.isLetterOrDigit(c)) { - working = working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$ - amp = i + 4; // account for the 4 extra characters - break; - } - i++; - } - cleanfrom = amp + 1; - } - - if (Pattern.compile("<<").matcher(working).find()) { - working = working.replaceAll("<<", "<<"); - } - - if (Pattern.compile(">>").matcher(working).find()) { - working = working.replaceAll(">>", ">>"); - } - - return working; - } - - /** - * Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip it out. XML only allows 4 entities: - * &amp;, &quot;, &lt; and &gt;. - * - * @param entity - * the entity to be replaced - * @return the substitution for the entity, either itself, the unicode equivalent or an empty string. - */ - private static String handleEntity(final String entity) { - if (goodEntities.contains(entity)) { return entity; } - - final String replace = badEntities.get(entity); - if (replace != null) { return replace; } - - return replace != null ? replace : ""; - } -} diff --git a/dhp-workflows/dhp-collector-worker/src/main/resources/application.properties b/dhp-workflows/dhp-collector-worker/src/main/resources/application.properties deleted file mode 100644 index 8b1378917..000000000 --- a/dhp-workflows/dhp-collector-worker/src/main/resources/application.properties +++ /dev/null @@ -1 +0,0 @@ - diff --git a/dhp-workflows/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-workflows/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java deleted file mode 100644 index 24f78f318..000000000 --- a/dhp-workflows/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java +++ /dev/null @@ -1,45 +0,0 @@ -package eu.dnetlib.collector.worker; - -import static org.junit.Assert.assertNotNull; - -import org.junit.Test; -import org.junit.runner.RunWith; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.CommandLineRunner; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.context.ApplicationContext; -import org.springframework.test.context.junit4.SpringRunner; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.collector.worker.utils.CollectorPluginEnumerator; - -@RunWith(SpringRunner.class) -@SpringBootTest -public class DnetCollectorWorkerApplicationTests { - - @Autowired - private ApplicationContext ctx; - - @Test - public void testFindPlugin() throws Exception { - final CollectorPluginEnumerator collectorPluginEnumerator = ctx.getBean(CollectorPluginEnumerator.class); - assertNotNull(collectorPluginEnumerator.getPluginByProtocol("oai")); - assertNotNull(collectorPluginEnumerator.getPluginByProtocol("OAI")); - } - - @Test - public void testCollectionOAI() throws Exception { - final ApiDescriptor api = new ApiDescriptor(); - api.setId("oai"); - api.setProtocol("oai"); - api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); - api.getParams().put("format", "oai_dc"); - - ObjectMapper mapper = new ObjectMapper(); - - System.out.println(mapper.writeValueAsString(api)); - } - -} diff --git a/dhp-workflows/dhp-collector-worker/target/classes/application.properties b/dhp-workflows/dhp-collector-worker/target/classes/application.properties deleted file mode 100644 index 8b1378917..000000000 --- a/dhp-workflows/dhp-collector-worker/target/classes/application.properties +++ /dev/null @@ -1 +0,0 @@ -