From ded6aef5e1eb2406ed74c88e4d0ac07411ad75a5 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 3 Apr 2019 16:05:16 +0200 Subject: [PATCH] moved collector worker --- .../dhp-collector-worker/README.md | 16 ++ dhp-applications/dhp-collector-worker/pom.xml | 85 ++++++ .../worker/DnetCollectorException.java | 30 ++ .../DnetCollectorWorkerApplication.java | 121 ++++++++ .../collector/worker/model/ApiDescriptor.java | 50 ++++ .../worker/plugins/CollectorPlugin.java | 11 + .../plugins/oai/OaiCollectorPlugin.java | 66 +++++ .../worker/plugins/oai/OaiIterator.java | 163 +++++++++++ .../plugins/oai/OaiIteratorFactory.java | 20 ++ .../utils/CollectorPluginEnumerator.java | 24 ++ .../utils/CollectorPluginErrorLogList.java | 19 ++ .../worker/utils/DnetWorkerCollector.java | 14 + .../collector/worker/utils/HttpConnector.java | 226 +++++++++++++++ .../collector/worker/utils/XmlCleaner.java | 259 ++++++++++++++++++ .../src/main/resources/application.properties | 1 + .../DnetCollectorWorkerApplicationTests.java | 45 +++ .../dhp-mdstore-manager-app/pom.xml | 39 +-- .../src/main/resources/application.properties | 4 +- dhp-applications/pom.xml | 35 +++ dhp-common/pom.xml | 1 + .../dhp/model/mdstore/MetadataRecord.java | 105 +++++++ .../dnetlib/dhp/model/mdstore/Provenance.java | 56 ++++ .../java/eu/dnetlib/dhp/utils/DHPUtils.java | 25 ++ .../dhp/model/mdstore/MetadataRecordTest.java | 15 + dhp-schemas/pom.xml | 1 + .../GenerateNativeStoreSparkJob.java | 6 +- .../dhp/collection/TransformSparkJobNode.java | 47 ++++ .../dhp/transformation/TransformFunction.java | 12 +- .../workflows/oozie_collection_workflows.xml | 2 +- .../dhp/collection/CollectionJobTest.java | 21 +- .../target/maven-archiver/pom.properties | 4 - dhp-workflows/pom.xml | 2 +- pom.xml | 21 +- 33 files changed, 1486 insertions(+), 60 deletions(-) create mode 100644 dhp-applications/dhp-collector-worker/README.md create mode 100644 dhp-applications/dhp-collector-worker/pom.xml create mode 100644 
dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java create mode 100644 dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java create mode 100644 dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java create mode 100644 dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java create mode 100644 dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java create mode 100644 dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java create mode 100644 dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIteratorFactory.java create mode 100644 dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java create mode 100644 dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java create mode 100644 dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java create mode 100644 dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java create mode 100644 dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java create mode 100644 dhp-applications/dhp-collector-worker/src/main/resources/application.properties create mode 100644 dhp-applications/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java create mode 100644 
dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java delete mode 100644 dhp-workflows/dhp-collector-worker/target/maven-archiver/pom.properties diff --git a/dhp-applications/dhp-collector-worker/README.md b/dhp-applications/dhp-collector-worker/README.md new file mode 100644 index 000000000..b6609179d --- /dev/null +++ b/dhp-applications/dhp-collector-worker/README.md @@ -0,0 +1,16 @@ +Description of the Module +-------------------------- +This module defines a **collector worker application** that run on Hadoop. + +It is responsible of harvesting metadata using different plugin, that should be passed as arguments +in the main class + +##Plugins +* OAI Plugin + + +## Api Descriptor +TODO + +## Usage +TODO \ No newline at end of file diff --git a/dhp-applications/dhp-collector-worker/pom.xml b/dhp-applications/dhp-collector-worker/pom.xml new file mode 100644 index 000000000..25d470020 --- /dev/null +++ b/dhp-applications/dhp-collector-worker/pom.xml @@ -0,0 +1,85 @@ + + + + + eu.dnetlib.dhp + dhp-applications + 1.0.0-SNAPSHOT + ../ + + + 4.0.0 + + eu.dnetlib + dhp-collector-worker + 1.0.0 + + + + + + cloudera + Cloudera Repository + https://repository.cloudera.com/artifactory/cloudera-repos + + true + + + false + + + + + + + org.springframework.boot + spring-boot-starter + + + org.apache.hadoop + hadoop-client + 2.6.0-cdh5.9.2 + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-databind + + + dom4j + dom4j + + + jaxen + jaxen + + + org.springframework.boot + spring-boot-starter-test + test + + + + + + + + UTF-8 + UTF-8 + cdh5.9.2 + 2.6.0-${dhp.cdh.version} + 2.2.0 + 2.11.8 + + + + \ No newline at end of file diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java 
b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java new file mode 100644 index 000000000..bc4287a0d --- /dev/null +++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorException.java @@ -0,0 +1,30 @@ +package eu.dnetlib.collector.worker; + +public class DnetCollectorException extends Exception { + + /** + * + */ + private static final long serialVersionUID = -290723075076039757L; + + public DnetCollectorException() { + super(); + } + + public DnetCollectorException(final String message, final Throwable cause, final boolean enableSuppression, final boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } + + public DnetCollectorException(final String message, final Throwable cause) { + super(message, cause); + } + + public DnetCollectorException(final String message) { + super(message); + } + + public DnetCollectorException(final Throwable cause) { + super(cause); + } + +} diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java new file mode 100644 index 000000000..c2854b8f2 --- /dev/null +++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplication.java @@ -0,0 +1,121 @@ +package eu.dnetlib.collector.worker; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.collector.worker.model.ApiDescriptor; +import eu.dnetlib.collector.worker.plugins.CollectorPlugin; +import eu.dnetlib.collector.worker.utils.CollectorPluginEnumerator; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; 
+import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.CommandLineRunner; +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; + +import java.io.IOException; +import java.net.URI; +import java.util.concurrent.atomic.AtomicInteger; + + +/** + * + * DnetCollectorWorkerApplication is the main class responsible for starting + * the Dnet collection into HDFS. + * This module is executed on the Hadoop cluster, taking as input some parameters + * that tell it which collector plugin to use and under which HDFS path to store the data + * + * + * @author Sandro La Bruzzo + */ +@SpringBootApplication +public class DnetCollectorWorkerApplication implements CommandLineRunner { + + private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class); + + @Autowired + private CollectorPluginEnumerator collectorPluginEnumerator; + + /** + * + * @param args + */ + public static void main(final String[] args) { + SpringApplication.run(DnetCollectorWorkerApplication.class, args); + } + + /** + * This module expects three arguments: + * the HDFS path where the sequence file will be stored, + * the JSON serialization of {@link ApiDescriptor}, + * and the name node URI. + */ + @Override + public void run(final String...
args) throws Exception { + if (args.length == 0) { return; } + if (args.length != 3) { throw new DnetCollectorException("Invalid number of parameters, expected: hdfs_path and json_api_description"); } + + //TODO : migrate to https://commons.apache.org/proper/commons-cli/usage.html + + + + + final String hdfsPath = args[0]; + + log.info("hdfsPath ="+hdfsPath); + + final String json = args[1]; + + + final String nameNode = args[2]; + + log.info("json = "+json); + final ObjectMapper jsonMapper = new ObjectMapper(); + final ApiDescriptor api = jsonMapper.readValue(json, ApiDescriptor.class); + + final CollectorPlugin plugin = collectorPluginEnumerator.getPluginByProtocol(api.getProtocol()); + + final String hdfsuri =nameNode; + + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + + System.setProperty("HADOOP_USER_NAME", "sandro.labruzzo"); + System.setProperty("hadoop.home.dir", "/"); + //Get the filesystem - HDFS + FileSystem fs = FileSystem.get(URI.create(hdfsuri), conf); + Path hdfswritepath = new Path(hdfsPath); + + log.info("Created path "+hdfswritepath.toString()); + + try(SequenceFile.Writer writer = SequenceFile.createWriter(conf, + SequenceFile.Writer.file(hdfswritepath), SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { + + final AtomicInteger counter = new AtomicInteger(0); + final IntWritable key = new IntWritable(counter.get()); + final Text value = new Text(); + + plugin.collect(api).forEach(content -> { + + key.set(counter.getAndIncrement()); + value.set(content); + try { + writer.append(key, value); + } catch (IOException e) { + throw new RuntimeException(e); + } + + }); + } + } + +} diff --git 
a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java new file mode 100644 index 000000000..6874d97f4 --- /dev/null +++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java @@ -0,0 +1,50 @@ +package eu.dnetlib.collector.worker.model; + +import java.util.HashMap; +import java.util.Map; + + +//TODO should be moved to dhp-common +public class ApiDescriptor { + + private String id; + + private String baseUrl; + + private String protocol; + + private Map params = new HashMap<>(); + + public String getBaseUrl() { + return baseUrl; + } + + public void setBaseUrl(final String baseUrl) { + this.baseUrl = baseUrl; + } + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public Map getParams() { + return params; + } + + public void setParams(final HashMap params) { + this.params = params; + } + + public String getProtocol() { + return protocol; + } + + public void setProtocol(final String protocol) { + this.protocol = protocol; + } + +} diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java new file mode 100644 index 000000000..5ec1e9a6e --- /dev/null +++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/CollectorPlugin.java @@ -0,0 +1,11 @@ +package eu.dnetlib.collector.worker.plugins; + +import java.util.stream.Stream; + +import eu.dnetlib.collector.worker.DnetCollectorException; +import eu.dnetlib.collector.worker.model.ApiDescriptor; + +public interface CollectorPlugin { + + Stream collect(ApiDescriptor api) throws DnetCollectorException; +} diff --git 
a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java new file mode 100644 index 000000000..7c4bec192 --- /dev/null +++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiCollectorPlugin.java @@ -0,0 +1,66 @@ +package eu.dnetlib.collector.worker.plugins.oai; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + +import com.google.common.base.Splitter; +import com.google.common.collect.Iterators; +import com.google.common.collect.Lists; + +import eu.dnetlib.collector.worker.DnetCollectorException; +import eu.dnetlib.collector.worker.model.ApiDescriptor; +import eu.dnetlib.collector.worker.plugins.CollectorPlugin; +import eu.dnetlib.collector.worker.utils.DnetWorkerCollector; + +@Component +@DnetWorkerCollector("oai") +public class OaiCollectorPlugin implements CollectorPlugin { + + private static final String FORMAT_PARAM = "format"; + private static final String OAI_SET_PARAM = "set"; + private static final Object OAI_FROM_DATE_PARAM = "fromDate"; + private static final Object OAI_UNTIL_DATE_PARAM = "untilDate"; + + @Autowired + private OaiIteratorFactory oaiIteratorFactory; + + @Override + public Stream collect(final ApiDescriptor api) throws DnetCollectorException { + final String baseUrl = api.getBaseUrl(); + final String mdFormat = api.getParams().get(FORMAT_PARAM); + final String setParam = api.getParams().get(OAI_SET_PARAM); + final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM); + final String untilDate = 
api.getParams().get(OAI_UNTIL_DATE_PARAM); + + final List sets = new ArrayList<>(); + if (setParam != null) { + sets.addAll(Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam))); + } + if (sets.isEmpty()) { + // If no set is defined, ALL the sets must be harvested + sets.add(""); + } + + if (baseUrl == null || baseUrl.isEmpty()) { throw new DnetCollectorException("Param 'baseurl' is null or empty"); } + + if (mdFormat == null || mdFormat.isEmpty()) { throw new DnetCollectorException("Param 'mdFormat' is null or empty"); } + + if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); } + + if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); } + + final Iterator> iters = sets.stream() + .map(set -> oaiIteratorFactory.newIterator(baseUrl, mdFormat, set, fromDate, untilDate)) + .iterator(); + + return StreamSupport.stream(Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), false); + } +} diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java new file mode 100644 index 000000000..191b7b596 --- /dev/null +++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/plugins/oai/OaiIterator.java @@ -0,0 +1,163 @@ +package eu.dnetlib.collector.worker.plugins.oai; + +import java.io.StringReader; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.util.Iterator; +import java.util.Queue; +import java.util.concurrent.PriorityBlockingQueue; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dom4j.Document; 
+import org.dom4j.DocumentException; +import org.dom4j.Node; +import org.dom4j.io.SAXReader; + +import eu.dnetlib.collector.worker.DnetCollectorException; +import eu.dnetlib.collector.worker.utils.HttpConnector; +import eu.dnetlib.collector.worker.utils.XmlCleaner; + +public class OaiIterator implements Iterator { + + private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM + + private final Queue queue = new PriorityBlockingQueue<>(); + private final SAXReader reader = new SAXReader(); + + private final String baseUrl; + private final String set; + private final String mdFormat; + private final String fromDate; + private final String untilDate; + private String token; + private boolean started; + private final HttpConnector httpConnector; + + public OaiIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate, + final HttpConnector httpConnector) { + this.baseUrl = baseUrl; + this.mdFormat = mdFormat; + this.set = set; + this.fromDate = fromDate; + this.untilDate = untilDate; + this.started = false; + this.httpConnector = httpConnector; + } + + private void verifyStarted() { + if (!this.started) { + this.started = true; + try { + this.token = firstPage(); + } catch (final DnetCollectorException e) { + throw new RuntimeException(e); + } + } + } + + @Override + public boolean hasNext() { + synchronized (queue) { + verifyStarted(); + return !queue.isEmpty(); + } + } + + @Override + public String next() { + synchronized (queue) { + verifyStarted(); + final String res = queue.poll(); + while (queue.isEmpty() && token != null && !token.isEmpty()) { + try { + token = otherPages(token); + } catch (final DnetCollectorException e) { + throw new RuntimeException(e); + } + } + return res; + } + } + + @Override + public void remove() {} + + private String firstPage() throws DnetCollectorException { + try { + String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + 
URLEncoder.encode(mdFormat, "UTF-8"); + if (set != null && !set.isEmpty()) { + url += "&set=" + URLEncoder.encode(set, "UTF-8"); + } + if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + url += "&from=" + URLEncoder.encode(fromDate, "UTF-8"); + } + if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + url += "&until=" + URLEncoder.encode(untilDate, "UTF-8"); + } + log.info("Start harvesting using url: " + url); + + return downloadPage(url); + } catch (final UnsupportedEncodingException e) { + throw new DnetCollectorException(e); + } + } + + private String extractResumptionToken(final String xml) { + + final String s = StringUtils.substringAfter(xml, "", " newIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate) { + return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, httpConnector); + } + +} diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java new file mode 100644 index 000000000..3f5b245ef --- /dev/null +++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginEnumerator.java @@ -0,0 +1,24 @@ +package eu.dnetlib.collector.worker.utils; + +import java.util.List; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + +import eu.dnetlib.collector.worker.plugins.CollectorPlugin; + +@Component +public class CollectorPluginEnumerator { + + @Autowired + private List plugins; + + public CollectorPlugin getPluginByProtocol(final String protocol) { + return plugins.stream() + .filter(p -> p.getClass().isAnnotationPresent(DnetWorkerCollector.class)) + .filter(p -> p.getClass().getAnnotation(DnetWorkerCollector.class).value().equalsIgnoreCase(protocol)) + 
.findFirst() + .get(); + } + +} diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java new file mode 100644 index 000000000..062f6c7a8 --- /dev/null +++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/CollectorPluginErrorLogList.java @@ -0,0 +1,19 @@ +package eu.dnetlib.collector.worker.utils; + +import java.util.LinkedList; + +public class CollectorPluginErrorLogList extends LinkedList { + + private static final long serialVersionUID = -6925786561303289704L; + + @Override + public String toString() { + String log = new String(); + int index = 0; + for (final String errorMessage : this) { + log += String.format("Retry #%s: %s / ", index++, errorMessage); + } + return log; + } + +} diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java new file mode 100644 index 000000000..28891c84f --- /dev/null +++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/DnetWorkerCollector.java @@ -0,0 +1,14 @@ +package eu.dnetlib.collector.worker.utils; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface DnetWorkerCollector { + + String value(); + +} diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java new file mode 100644 index 000000000..8a24381e5 --- /dev/null +++ 
b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/HttpConnector.java @@ -0,0 +1,226 @@ +package eu.dnetlib.collector.worker.utils; + +import java.io.IOException; +import java.io.InputStream; +import java.net.CookieHandler; +import java.net.CookieManager; +import java.net.CookiePolicy; +import java.net.HttpURLConnection; +import java.net.URL; +import java.security.GeneralSecurityException; +import java.security.cert.X509Certificate; +import java.util.List; +import java.util.Map; + +import javax.net.ssl.HttpsURLConnection; +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.math.NumberUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.springframework.stereotype.Component; + +import eu.dnetlib.collector.worker.DnetCollectorException; + +@Component +public class HttpConnector { + + private static final Log log = LogFactory.getLog(HttpConnector.class); + + private int maxNumberOfRetry = 6; + private int defaultDelay = 120; // seconds + private int readTimeOut = 120; // seconds + + private String responseType = null; + + private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; + + public HttpConnector() { + CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); + } + + /** + * Given the URL returns the content via HTTP GET + * + * @param requestUrl + * the URL + * @return the content of the downloaded resource + * @throws CollectorServiceException + * when retrying more than maxNumberOfRetry times + */ + public String getInputSource(final String requestUrl) throws DnetCollectorException { + return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); + } + + /** + * Given the URL returns the content as a stream via HTTP GET + * + * @param requestUrl + * the URL + * @return the 
content of the downloaded resource as InputStream + * @throws CollectorServiceException + * when retrying more than maxNumberOfRetry times + */ + public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException { + return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); + } + + private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) + throws DnetCollectorException { + try { + final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); + try { + return IOUtils.toString(s); + } catch (final IOException e) { + log.error("error while retrieving from http-connection occured: " + requestUrl, e); + Thread.sleep(defaultDelay * 1000); + errorList.add(e.getMessage()); + return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); + } finally { + IOUtils.closeQuietly(s); + } + } catch (final InterruptedException e) { + throw new DnetCollectorException(e); + } + } + + private InputStream attemptDownload(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) + throws DnetCollectorException { + + if (retryNumber > maxNumberOfRetry) { throw new DnetCollectorException("Max number of retries exceeded. 
Cause: \n " + errorList); } + + log.debug("Downloading " + requestUrl + " - try: " + retryNumber); + try { + InputStream input = null; + + try { + final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); + urlConn.setInstanceFollowRedirects(false); + urlConn.setReadTimeout(readTimeOut * 1000); + urlConn.addRequestProperty("User-Agent", userAgent); + + if (log.isDebugEnabled()) { + logHeaderFields(urlConn); + } + + final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); + if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { + log.warn("waiting and repeating request after " + retryAfter + " sec."); + Thread.sleep(retryAfter * 1000); + errorList.add("503 Service Unavailable"); + urlConn.disconnect(); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + } else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) { + final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); + log.debug("The requested url has been moved to " + newUrl); + errorList.add(String.format("%s %s. 
Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl)); + urlConn.disconnect(); + return attemptDownload(newUrl, retryNumber + 1, errorList); + } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { + log.error(String.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); + Thread.sleep(defaultDelay * 1000); + errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); + urlConn.disconnect(); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + } else { + input = urlConn.getInputStream(); + responseType = urlConn.getContentType(); + return input; + } + } catch (final IOException e) { + log.error("error while retrieving from http-connection occured: " + requestUrl, e); + Thread.sleep(defaultDelay * 1000); + errorList.add(e.getMessage()); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + } + } catch (final InterruptedException e) { + throw new DnetCollectorException(e); + } + } + + private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { + log.debug("StatusCode: " + urlConn.getResponseMessage()); + + for (final Map.Entry> e : urlConn.getHeaderFields().entrySet()) { + if (e.getKey() != null) { + for (final String v : e.getValue()) { + log.debug(" key: " + e.getKey() + " - value: " + v); + } + } + } + } + + private int obtainRetryAfter(final Map> headerMap) { + for (final String key : headerMap.keySet()) { + if (key != null && key.toLowerCase().equals("retry-after") && headerMap.get(key).size() > 0 + && NumberUtils.isNumber(headerMap.get(key).get(0))) { return Integer.parseInt(headerMap.get(key).get(0)) + 10; } + } + return -1; + } + + private String obtainNewLocation(final Map> headerMap) throws DnetCollectorException { + for (final String key : headerMap.keySet()) { + if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) { return headerMap.get(key).get(0); } + } + 
throw new DnetCollectorException("The requested url has been MOVED, but 'location' param is MISSING"); + } + + /** + * register for https scheme; this is a workaround and not intended for the use in trusted environments + */ + public void initTrustManager() { + final X509TrustManager tm = new X509TrustManager() { + + @Override + public void checkClientTrusted(final X509Certificate[] xcs, final String string) {} + + @Override + public void checkServerTrusted(final X509Certificate[] xcs, final String string) {} + + @Override + public X509Certificate[] getAcceptedIssuers() { + return null; + } + }; + try { + final SSLContext ctx = SSLContext.getInstance("TLS"); + ctx.init(null, new TrustManager[] { tm }, null); + HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); + } catch (final GeneralSecurityException e) { + log.fatal(e); + throw new IllegalStateException(e); + } + } + + public int getMaxNumberOfRetry() { + return maxNumberOfRetry; + } + + public void setMaxNumberOfRetry(final int maxNumberOfRetry) { + this.maxNumberOfRetry = maxNumberOfRetry; + } + + public int getDefaultDelay() { + return defaultDelay; + } + + public void setDefaultDelay(final int defaultDelay) { + this.defaultDelay = defaultDelay; + } + + public int getReadTimeOut() { + return readTimeOut; + } + + public void setReadTimeOut(final int readTimeOut) { + this.readTimeOut = readTimeOut; + } + + public String getResponseType() { + return responseType; + } + +} diff --git a/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java new file mode 100644 index 000000000..7d1121a6d --- /dev/null +++ b/dhp-applications/dhp-collector-worker/src/main/java/eu/dnetlib/collector/worker/utils/XmlCleaner.java @@ -0,0 +1,259 @@ +package eu.dnetlib.collector.worker.utils; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; 
+import java.util.Set; +import java.util.regex.Pattern; + +/** + * @author jochen, Andreas Czerniak + * + */ +public class XmlCleaner { + + /** + * Pattern for numeric entities. + */ + private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); //$NON-NLS-1$ + // private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); //$NON-NLS-1$ + + // see https://www.w3.org/TR/REC-xml/#charsets , not only limited to + private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];"); + + /** + * Pattern that negates the allowable XML 4 byte unicode characters. Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | + * [#x10000-#x10FFFF] + */ + private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); //$NON-NLS-1$ + + // Map entities to their unicode equivalent + private static Set goodEntities = new HashSet<>(); + private static Map badEntities = new HashMap<>(); + + static { + // pre-defined XML entities + goodEntities.add("""); //$NON-NLS-1$ // quotation mark + goodEntities.add("&"); //$NON-NLS-1$ // ampersand + goodEntities.add("<"); //$NON-NLS-1$ // less-than sign + goodEntities.add(">"); //$NON-NLS-1$ // greater-than sign + // control entities + // badEntities.put(" ", ""); + badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("€", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‚", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ƒ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("„", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("…", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("†", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + 
badEntities.put("‡", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ˆ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‰", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Š", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‹", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Œ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Ž", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‘", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("’", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("“", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("”", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("•", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("–", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("—", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("˜", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("™", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("š", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("›", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("œ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ž", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + 
badEntities.put("Ÿ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + // misc entities + badEntities.put("€", "\u20AC"); //$NON-NLS-1$ //$NON-NLS-2$ // euro + badEntities.put("‘", "\u2018"); //$NON-NLS-1$ //$NON-NLS-2$ // left single quotation mark + badEntities.put("’", "\u2019"); //$NON-NLS-1$ //$NON-NLS-2$ // right single quotation mark + // Latin 1 entities + badEntities.put(" ", "\u00A0"); //$NON-NLS-1$ //$NON-NLS-2$ // no-break space + badEntities.put("¡", "\u00A1"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation mark + badEntities.put("¢", "\u00A2"); //$NON-NLS-1$ //$NON-NLS-2$ // cent sign + badEntities.put("£", "\u00A3"); //$NON-NLS-1$ //$NON-NLS-2$ // pound sign + badEntities.put("¤", "\u00A4"); //$NON-NLS-1$ //$NON-NLS-2$ // currency sign + badEntities.put("¥", "\u00A5"); //$NON-NLS-1$ //$NON-NLS-2$ // yen sign + badEntities.put("¦", "\u00A6"); //$NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar + badEntities.put("§", "\u00A7"); //$NON-NLS-1$ //$NON-NLS-2$ // section sign + badEntities.put("¨", "\u00A8"); //$NON-NLS-1$ //$NON-NLS-2$ // diaeresis + badEntities.put("©", "\u00A9"); //$NON-NLS-1$ //$NON-NLS-2$ // copyright sign + badEntities.put("ª", "\u00AA"); //$NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal indicator + badEntities.put("«", "\u00AB"); //$NON-NLS-1$ //$NON-NLS-2$ // left-pointing double angle quotation mark + badEntities.put("¬", "\u00AC"); //$NON-NLS-1$ //$NON-NLS-2$ // not sign + badEntities.put("­", "\u00AD"); //$NON-NLS-1$ //$NON-NLS-2$ // soft hyphen + badEntities.put("®", "\u00AE"); //$NON-NLS-1$ //$NON-NLS-2$ // registered sign + badEntities.put("¯", "\u00AF"); //$NON-NLS-1$ //$NON-NLS-2$ // macron + badEntities.put("°", "\u00B0"); //$NON-NLS-1$ //$NON-NLS-2$ // degree sign + badEntities.put("±", "\u00B1"); //$NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign + badEntities.put("²", "\u00B2"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript two + badEntities.put("³", "\u00B3"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript three + 
badEntities.put("´", "\u00B4"); //$NON-NLS-1$ //$NON-NLS-2$ // acute accent + badEntities.put("µ", "\u00B5"); //$NON-NLS-1$ //$NON-NLS-2$ // micro sign + badEntities.put("¶", "\u00B6"); //$NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign + badEntities.put("·", "\u00B7"); //$NON-NLS-1$ //$NON-NLS-2$ // middle dot + badEntities.put("¸", "\u00B8"); //$NON-NLS-1$ //$NON-NLS-2$ // cedilla + badEntities.put("¹", "\u00B9"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript one + badEntities.put("º", "\u00BA"); //$NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal indicator + badEntities.put("»", "\u00BB"); //$NON-NLS-1$ //$NON-NLS-2$ // right-pointing double angle quotation mark + badEntities.put("¼", "\u00BC"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one quarter + badEntities.put("½", "\u00BD"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one half + badEntities.put("¾", "\u00BE"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three quarters + badEntities.put("¿", "\u00BF"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted question mark + badEntities.put("À", "\u00C0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with grave + badEntities.put("Á", "\u00C1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with acute + badEntities.put("Â", "\u00C2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with circumflex + badEntities.put("Ã", "\u00C3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with tilde + badEntities.put("Ä", "\u00C4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with diaeresis + badEntities.put("Å", "\u00C5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with ring above + badEntities.put("Æ", "\u00C6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter AE + badEntities.put("Ç", "\u00C7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter C with cedilla + badEntities.put("È", "\u00C8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with grave + badEntities.put("É", "\u00C9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin 
capital letter E with acute + badEntities.put("Ê", "\u00CA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with circumflex + badEntities.put("Ë", "\u00CB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with diaeresis + badEntities.put("Ì", "\u00CC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with grave + badEntities.put("Í", "\u00CD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with acute + badEntities.put("Î", "\u00CE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with circumflex + badEntities.put("Ï", "\u00CF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with diaeresis + badEntities.put("Ð", "\u00D0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH + badEntities.put("Ñ", "\u00D1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter N with tilde + badEntities.put("Ò", "\u00D2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with grave + badEntities.put("Ó", "\u00D3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with acute + badEntities.put("Ô", "\u00D4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with circumflex + badEntities.put("Õ", "\u00D5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with tilde + badEntities.put("Ö", "\u00D6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with diaeresis + badEntities.put("×", "\u00D7"); //$NON-NLS-1$ //$NON-NLS-2$ // multiplication sign + badEntities.put("Ø", "\u00D8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with stroke + badEntities.put("Ù", "\u00D9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with grave + badEntities.put("Ú", "\u00DA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with acute + badEntities.put("Û", "\u00DB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with circumflex + badEntities.put("Ü", "\u00DC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with diaeresis + badEntities.put("Ý", "\u00DD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin 
capital letter Y with acute + badEntities.put("Þ", "\u00DE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter THORN + badEntities.put("ß", "\u00DF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter sharp s + badEntities.put("à", "\u00E0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with grave + badEntities.put("á", "\u00E1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with acute + badEntities.put("â", "\u00E2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with circumflex + badEntities.put("ã", "\u00E3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with tilde + badEntities.put("ä", "\u00E4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with diaeresis + badEntities.put("å", "\u00E5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with ring above + badEntities.put("æ", "\u00E6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae + badEntities.put("ç", "\u00E7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter c with cedilla + badEntities.put("è", "\u00E8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with grave + badEntities.put("é", "\u00E9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with acute + badEntities.put("ê", "\u00EA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with circumflex + badEntities.put("ë", "\u00EB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with diaeresis + badEntities.put("ì", "\u00EC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with grave + badEntities.put("í", "\u00ED"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with acute + badEntities.put("î", "\u00EE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with circumflex + badEntities.put("ï", "\u00EF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with diaeresis + badEntities.put("ð", "\u00F0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth + badEntities.put("ñ", "\u00F1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter n with tilde + badEntities.put("ò", 
"\u00F2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with grave + badEntities.put("ó", "\u00F3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with acute + badEntities.put("ô", "\u00F4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with circumflex + badEntities.put("õ", "\u00F5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with tilde + badEntities.put("ö", "\u00F6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with diaeresis + badEntities.put("÷", "\u00F7"); //$NON-NLS-1$ //$NON-NLS-2$ // division sign + badEntities.put("ø", "\u00F8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with stroke + badEntities.put("ù", "\u00F9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with grave + badEntities.put("ú", "\u00FA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with acute + badEntities.put("û", "\u00FB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with circumflex + badEntities.put("ü", "\u00FC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with diaeresis + badEntities.put("ý", "\u00FD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with acute + badEntities.put("þ", "\u00FE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter thorn + badEntities.put("ÿ", "\u00FF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with diaeresis + } + + /** + * For each entity in the input that is not allowed in XML, replace the entity with its unicode equivalent or remove it. For each + * instance of a bare {@literal &}, replace it with {@literal &
+ * } XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal &lt;} and {@literal &gt;}. + * + * @param broken + * the string to handle entities + * @return the string with entities appropriately fixed up + */ + static public String cleanAllEntities(final String broken) { + if (broken == null) { return null; } + + String working = invalidControlCharPattern.matcher(broken).replaceAll(""); + working = invalidCharacterPattern.matcher(working).replaceAll(""); + + int cleanfrom = 0; + + while (true) { + int amp = working.indexOf('&', cleanfrom); + // If there are no more amps then we are done + if (amp == -1) { + break; + } + // Skip references of the kind &#ddd; + if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) { + cleanfrom = working.indexOf(';', amp) + 1; + continue; + } + int i = amp + 1; + while (true) { + // if we are at the end of the string then just escape the '&'; + if (i >= working.length()) { return working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$ + } + // if we have come to a ; then we have an entity + // If it is something that xml can't handle then replace it. 
+ final char c = working.charAt(i); + if (c == ';') { + final String entity = working.substring(amp, i + 1); + final String replace = handleEntity(entity); + working = working.substring(0, amp) + replace + working.substring(i + 1); + break; + } + // Did we end an entity without finding a closing ; + // Then treat it as an '&' that needs to be replaced with & + if (!Character.isLetterOrDigit(c)) { + working = working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$ + amp = i + 4; // account for the 4 extra characters + break; + } + i++; + } + cleanfrom = amp + 1; + } + + if (Pattern.compile("<<").matcher(working).find()) { + working = working.replaceAll("<<", "<<"); + } + + if (Pattern.compile(">>").matcher(working).find()) { + working = working.replaceAll(">>", ">>"); + } + + return working; + } + + /** + * Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip it out. XML only allows 4 entities: + * &amp;, &quot;, &lt; and &gt;. + * + * @param entity + * the entity to be replaced + * @return the substitution for the entity, either itself, the unicode equivalent or an empty string. + */ + private static String handleEntity(final String entity) { + if (goodEntities.contains(entity)) { return entity; } + + final String replace = badEntities.get(entity); + if (replace != null) { return replace; } + + return replace != null ? 
replace : ""; + } +} diff --git a/dhp-applications/dhp-collector-worker/src/main/resources/application.properties b/dhp-applications/dhp-collector-worker/src/main/resources/application.properties new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/dhp-applications/dhp-collector-worker/src/main/resources/application.properties @@ -0,0 +1 @@ + diff --git a/dhp-applications/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-applications/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java new file mode 100644 index 000000000..24f78f318 --- /dev/null +++ b/dhp-applications/dhp-collector-worker/src/test/java/eu/dnetlib/collector/worker/DnetCollectorWorkerApplicationTests.java @@ -0,0 +1,45 @@ +package eu.dnetlib.collector.worker; + +import static org.junit.Assert.assertNotNull; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.CommandLineRunner; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.context.ApplicationContext; +import org.springframework.test.context.junit4.SpringRunner; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.collector.worker.model.ApiDescriptor; +import eu.dnetlib.collector.worker.utils.CollectorPluginEnumerator; + +@RunWith(SpringRunner.class) +@SpringBootTest +public class DnetCollectorWorkerApplicationTests { + + @Autowired + private ApplicationContext ctx; + + @Test + public void testFindPlugin() throws Exception { + final CollectorPluginEnumerator collectorPluginEnumerator = ctx.getBean(CollectorPluginEnumerator.class); + assertNotNull(collectorPluginEnumerator.getPluginByProtocol("oai")); + assertNotNull(collectorPluginEnumerator.getPluginByProtocol("OAI")); + } + + @Test + public void testCollectionOAI() throws Exception { + final ApiDescriptor 
api = new ApiDescriptor(); + api.setId("oai"); + api.setProtocol("oai"); + api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); + api.getParams().put("format", "oai_dc"); + + ObjectMapper mapper = new ObjectMapper(); + + System.out.println(mapper.writeValueAsString(api)); + } + +} diff --git a/dhp-applications/dhp-mdstore-manager-app/pom.xml b/dhp-applications/dhp-mdstore-manager-app/pom.xml index 292a3ca34..db3d702ce 100644 --- a/dhp-applications/dhp-mdstore-manager-app/pom.xml +++ b/dhp-applications/dhp-mdstore-manager-app/pom.xml @@ -3,30 +3,31 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - eu.dnetlib.dhp dhp-mdstore-manager-app 1.1.0-SNAPSHOT - org.springframework.boot - spring-boot-starter-parent - 2.1.3.RELEASE - + eu.dnetlib.dhp + dhp-applications + 1.0.0-SNAPSHOT + ../ - + + @@ -84,16 +85,6 @@ - - - - org.springframework.boot - spring-boot-maven-plugin - - true - - - - + diff --git a/dhp-applications/dhp-mdstore-manager-app/src/main/resources/application.properties b/dhp-applications/dhp-mdstore-manager-app/src/main/resources/application.properties index 792c39865..46f38ca64 100644 --- a/dhp-applications/dhp-mdstore-manager-app/src/main/resources/application.properties +++ b/dhp-applications/dhp-mdstore-manager-app/src/main/resources/application.properties @@ -2,8 +2,8 @@ spring.main.banner-mode = console logging.level.root = INFO spring.datasource.url=jdbc:postgresql://localhost:5432/mdstoremanager -spring.datasource.username= -spring.datasource.password= +spring.datasource.username=dnet +spring.datasource.password=dnetPwd spring.jpa.properties.hibernate.dialect = org.hibernate.dialect.PostgreSQLDialect diff --git a/dhp-applications/pom.xml b/dhp-applications/pom.xml index 51d4047ac..7e52fc76d 100644 --- a/dhp-applications/pom.xml +++ b/dhp-applications/pom.xml @@ -5,11 +5,46 @@ eu.dnetlib.dhp dhp 
1.0.0-SNAPSHOT + ../ dhp-applications pom dhp-mdstore-manager-app + dhp-collector-worker + + + + + org.springframework.boot + spring-boot-dependencies + 2.1.3.RELEASE + pom + import + + + + + + + eu.dnetlib.dhp + dhp-common + 1.0.0-SNAPSHOT + + + + + + + org.springframework.boot + spring-boot-maven-plugin + + true + + + + + diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 3b7b6aca3..2de3dc310 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -8,6 +8,7 @@ eu.dnetlib.dhp dhp 1.0.0-SNAPSHOT + ../ dhp-common diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java new file mode 100644 index 000000000..3be1e1386 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java @@ -0,0 +1,105 @@ +package eu.dnetlib.dhp.model.mdstore; + +import eu.dnetlib.dhp.utils.DHPUtils; +import org.apache.commons.lang3.StringUtils; + +import java.io.Serializable; + + +/** + * This class models a record inside the new MetadataStore + * + */ +public class MetadataRecord implements Serializable { + + private String id; + + private String originalId; + + private String encoding; + + private Provenance provenance; + + private String body; + + private long dateOfCollection; + + + public MetadataRecord() { + this.dateOfCollection = System.currentTimeMillis(); + } + + public MetadataRecord(String originalId, String encoding, Provenance provenance, String body, long dateOfCollection) { + + this.originalId = originalId; + this.encoding = encoding; + this.provenance = provenance; + this.body = body; + this.dateOfCollection = dateOfCollection; + this.id = DHPUtils.generateIdentifier(originalId,this.provenance.getNsPrefix()); + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + + public String getOriginalId() { + return originalId; + } + + public void setOriginalId(String originalId) { + 
this.originalId = originalId; + } + + public String getEncoding() { + return encoding; + } + + public void setEncoding(String encoding) { + this.encoding = encoding; + } + + public Provenance getProvenance() { + return provenance; + } + + public void setProvenance(Provenance provenance) { + this.provenance = provenance; + } + + + public String getBody() { + return body; + } + + public void setBody(String body) { + this.body = body; + } + + public long getDateOfCollection() { + return dateOfCollection; + } + + public void setDateOfCollection(long dateOfCollection) { + this.dateOfCollection = dateOfCollection; + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof MetadataRecord)) { + return false; + } + return ((MetadataRecord) o).getId().equalsIgnoreCase(id); + + } + + @Override + public int hashCode() { + return id.hashCode(); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java new file mode 100644 index 000000000..de67281f2 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java @@ -0,0 +1,56 @@ +package eu.dnetlib.dhp.model.mdstore; + +import java.io.Serializable; + + +/** + * @author Sandro La Bruzzo + * + * The Provenance class models the provenance of the record in the metadataStore. + * It contains the identifier and the name of the datasource that provides the + * record + * + */ +public class Provenance implements Serializable { + + private String datasourceId; + + + private String datasourceName; + + private String nsPrefix; + + public Provenance() { + + } + + public Provenance(String datasourceId, String datasourceName, String nsPrefix) { + this.datasourceId = datasourceId; + this.datasourceName = datasourceName; + this.nsPrefix = nsPrefix; + } + + public String getDatasourceId() { + return datasourceId; + } + + public void setDatasourceId(String datasourceId) { + this.datasourceId = datasourceId; + }
+ + public String getDatasourceName() { + return datasourceName; + } + + public void setDatasourceName(String datasourceName) { + this.datasourceName = datasourceName; + } + + public String getNsPrefix() { + return nsPrefix; + } + + public void setNsPrefix(String nsPrefix) { + this.nsPrefix = nsPrefix; + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java new file mode 100644 index 000000000..d37272d21 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java @@ -0,0 +1,25 @@ +package eu.dnetlib.dhp.utils; + +import org.apache.commons.codec.binary.Hex; +import org.apache.commons.lang3.StringUtils; + +import java.security.MessageDigest; + +public class DHPUtils { + + public static String md5(final String s) { + try { + final MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(s.getBytes("UTF-8")); + return new String(Hex.encodeHex(md.digest())); + } catch (final Exception e) { + System.err.println("Error creating id"); + return null; + } + } + + public static String generateIdentifier(final String originalId, final String nsPrefix) { + return String.format("%s::%s",nsPrefix, DHPUtils.md5(originalId)); + } + +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java new file mode 100644 index 000000000..4515429ea --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java @@ -0,0 +1,15 @@ +package eu.dnetlib.dhp.model.mdstore; + +import org.junit.Test; + +import static org.junit.Assert.assertTrue; + +public class MetadataRecordTest { + + @Test + public void getTimestamp() { + + MetadataRecord r = new MetadataRecord(); + assertTrue(r.getDateOfCollection() >0); + } +} \ No newline at end of file diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index 6ead99f42..5c5b804a7 100644 --- 
a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -8,6 +8,7 @@ eu.dnetlib.dhp dhp 1.0.0-SNAPSHOT + ../ dhp-schemas diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index d49846b23..719e1f134 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -50,10 +50,6 @@ public class GenerateNativeStoreSparkJob { } } - - - - public static void main(String[] args) throws Exception { Options options = new Options(); @@ -113,7 +109,7 @@ public class GenerateNativeStoreSparkJob { final SparkSession spark = SparkSession .builder() .appName("GenerateNativeStoreSparkJob") - .master("yarn") + .master("local") .getOrCreate(); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/TransformSparkJobNode.java index 2e1d53aa4..926b13bba 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/TransformSparkJobNode.java @@ -1,4 +1,51 @@ package eu.dnetlib.dhp.collection; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.TransformFunction; +import org.apache.commons.cli.*; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.util.LongAccumulator; + public class TransformSparkJobNode { + + public static void main(String[] args) throws ParseException { + Options 
options = new Options(); + + options.addOption(Option.builder("i") + .longOpt("input") + .required(true) + .desc("input path of the sequence file") + .hasArg() // This option has an argument. + .build()); + options.addOption(Option.builder("o") + .longOpt("output") + .required(true) + .desc("output path of the mdstore") + .hasArg() + .build()); + + final CommandLineParser parser = new DefaultParser(); + final CommandLine cmd = parser.parse( options, args); + + final String inputPath = cmd.getOptionValue("i"); + final String outputPath = cmd.getOptionValue("o"); + + final SparkSession spark = SparkSession + .builder() + .appName("GenerateNativeStoreSparkJob") + .master("local") + .getOrCreate(); + + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder); + final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems"); + + final TransformFunction mfunc = new TransformFunction(totalItems); + mdstoreInput.map(mfunc, encoder).write().format("parquet").save(outputPath); + System.out.println("totalItems = " + totalItems.value()); + + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java index 393db48ba..2380c673a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java @@ -1,13 +1,21 @@ -package eu.dnetlib.dhp.collection; +package eu.dnetlib.dhp.transformation; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.util.LongAccumulator; public class TransformFunction implements MapFunction { + private final LongAccumulator totalItems; + + public 
TransformFunction(LongAccumulator totalItems) { + this.totalItems= totalItems; + } + @Override public MetadataRecord call(MetadataRecord value) throws Exception { - return null; + totalItems.add(1); + return value; } } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/oozie/workflows/oozie_collection_workflows.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/oozie/workflows/oozie_collection_workflows.xml index 0deda423b..c6279e7c5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/oozie/workflows/oozie_collection_workflows.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/oozie/workflows/oozie_collection_workflows.xml @@ -20,7 +20,7 @@ A json encoding of the Datasource Info - identifierPath + identifierXPath An xpath to retrieve the metadata idnentifier for the generation of DNet Identifier diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java index 61bb472ce..3bed60586 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java @@ -15,13 +15,23 @@ public class CollectionJobTest { @Test public void test () throws Exception { Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); - - -// GenerateNativeStoreSparkJob.main(new String[] {"XML", ""+System.currentTimeMillis(), new ObjectMapper().writeValueAsString(provenance), "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']","/home/sandro/Downloads/mdstore_oai","/home/sandro/Downloads/mdstore_result"}); + GenerateNativeStoreSparkJob.main(new String[] {"-e", "XML","-d", ""+System.currentTimeMillis(),"-p", new ObjectMapper().writeValueAsString(provenance), 
"-x","./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']","-i","/home/sandro/Downloads/oai_1","-o","/home/sandro/Downloads/mdstore_result"}); System.out.println(new ObjectMapper().writeValueAsString(provenance)); } + @Test + public void transformTest () throws Exception { + + TransformSparkJobNode.main(new String[]{"-o","/home/sandro/Downloads/mdstore_cleande","-i","/home/sandro/Downloads/mdstore_result"}); + + + + + } + + + @Test public void testGenerationMetadataRecord() throws Exception { @@ -29,6 +39,7 @@ public class CollectionJobTest { MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null); + assert record != null; System.out.println(record.getId()); System.out.println(record.getOriginalId()); @@ -42,9 +53,11 @@ public class CollectionJobTest { final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null); MetadataRecord record1 = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null); + assert record != null; record.setBody("ciao"); + assert record1 != null; record1.setBody("mondo"); - Assert.assertTrue(record.equals(record1)); + Assert.assertEquals(record, record1); } diff --git a/dhp-workflows/dhp-collector-worker/target/maven-archiver/pom.properties b/dhp-workflows/dhp-collector-worker/target/maven-archiver/pom.properties deleted file mode 100644 index ef502424b..000000000 --- 
a/dhp-workflows/dhp-collector-worker/target/maven-archiver/pom.properties +++ /dev/null @@ -1,4 +0,0 @@ -#Created by Apache Maven 3.6.0 -version=1.0.0 -groupId=eu.dnetlib -artifactId=dhp-collector-worker diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index f892307aa..f8fcde2a8 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -5,11 +5,11 @@ eu.dnetlib.dhp dhp 1.0.0-SNAPSHOT + ../ dhp-workflows pom - dhp-collector-worker dhp-aggregation diff --git a/pom.xml b/pom.xml index fde394c83..11544cc60 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,7 @@ dhp-common dhp-workflows - dhp-applications + dhp-applications @@ -37,9 +37,9 @@ - scm:git:ssh://git@github.com/??? - scm:git:ssh://git@github.com/???.git - https://github.com/??? + scm:git:ssh://git@github.com/dnet-team/dnet-hadoop.git + scm:git:ssh://git@github.com:dnet-team/dnet-hadoop.git + https://github.com/dnet-team/dnet-hadoop HEAD @@ -135,22 +135,16 @@ commons-cli 1.4 - - - - target target/classes ${project.artifactId}-${project.version} target/test-classes - - org.apache.maven.plugins maven-compiler-plugin @@ -191,7 +185,6 @@ true - org.apache.maven.plugins maven-javadoc-plugin @@ -200,27 +193,21 @@ true - org.apache.maven.plugins maven-dependency-plugin 3.0.0 - - - - org.apache.maven.plugins maven-release-plugin 2.5.3 -