From cda210a2ca993a20507108eb94970d89e05da3af Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo
Date: Mon, 25 Jan 2021 14:17:42 +0100
Subject: [PATCH 01/86] changed documentation since it didn't reflect the current status

---
 dhp-workflows/dhp-aggregation/README.md | 31 +++++++------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

diff --git a/dhp-workflows/dhp-aggregation/README.md b/dhp-workflows/dhp-aggregation/README.md
index 02583b443..e46fdeb16 100644
--- a/dhp-workflows/dhp-aggregation/README.md
+++ b/dhp-workflows/dhp-aggregation/README.md
@@ -2,28 +2,15 @@ Description of the Module
 --------------------------
 This module defines a **collector worker application** that runs on Hadoop.
 
-It is responsible for harvesting metadata using different plugins.
+It is responsible for harvesting metadata using different collector plugins and for transforming it into the common metadata model.
 
-The collector worker uses a message queue to inform the progress
-of the harvesting action (using a message queue for sending **ONGOING** messages) furthermore,
-It gives, at the end of the job, some information about the status
-of the collection i.e Number of records collected(using a message queue for sending **REPORT** messages).
-
-To work the collection worker need some parameter like:
-
-* **hdfsPath**: the path where storing the sequential file
-* **apidescriptor**: the JSON encoding of the API Descriptor
-* **namenode**: the Name Node URI
-* **userHDFS**: the user wich create the hdfs seq file
-* **rabbitUser**: the user to connect with RabbitMq for messaging
-* **rabbitPassWord**: the password to connect with RabbitMq for messaging
-* **rabbitHost**: the host of the RabbitMq server
-* **rabbitOngoingQueue**: the name of the ongoing queue
-* **rabbitReportQueue**: the name of the report queue
-* **workflowId**: the identifier of the dnet Workflow
-
-##Plugins
+# Collector Plugins
 * OAI Plugin
 
-## Usage
-TODO
\ No newline at end of file
+# Transformation Plugins
+TODO
+
+
+# Usage
+TODO
+

From ffb092b8d3edf5e3f451c1499de073b6ca341efc Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo
Date: Mon, 25 Jan 2021 15:05:37 +0100
Subject: [PATCH 02/86] removed duplicate code HttpConnector.java

---
 .../CollectorPluginErrorLogList.java          |  20 --
 .../CollectorServiceException.java            |  20 --
 .../project/httpconnector/HttpConnector.java  | 240 ------------------
 .../actionmanager/project/utils/ReadCSV.java  |   2 +-
 .../project/utils/ReadExcel.java              |   3 +-
 .../GenerateNativeStoreSparkJob.java          | 198 ++++++++-------
 .../project/EXCELParserTest.java              |   7 +-
 .../httpconnector/HttpConnectorTest.java      |   6 +-
 .../eu/dnetlib/dhp/transform/ext_simple.xsl   |   4 +-
 9 files changed, 113 insertions(+), 387 deletions(-)
 delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorPluginErrorLogList.java
 delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorServiceException.java
 delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnector.java

diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorPluginErrorLogList.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorPluginErrorLogList.java
deleted file mode 100644
index 9d3f88265..000000000
---
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorPluginErrorLogList.java +++ /dev/null @@ -1,20 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.httpconnector; - -import java.util.LinkedList; - -public class CollectorPluginErrorLogList extends LinkedList { - - private static final long serialVersionUID = -6925786561303289704L; - - @Override - public String toString() { - String log = new String(); - int index = 0; - for (String errorMessage : this) { - log += String.format("Retry #%s: %s / ", index++, errorMessage); - } - return log; - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorServiceException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorServiceException.java deleted file mode 100644 index 9167d97b4..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorServiceException.java +++ /dev/null @@ -1,20 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.httpconnector; - -public class CollectorServiceException extends Exception { - - private static final long serialVersionUID = 7523999812098059764L; - - public CollectorServiceException(String string) { - super(string); - } - - public CollectorServiceException(String string, Throwable exception) { - super(string, exception); - } - - public CollectorServiceException(Throwable exception) { - super(exception); - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnector.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnector.java deleted file mode 100644 index e20518b55..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnector.java +++ /dev/null @@ -1,240 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.httpconnector; - -import java.io.IOException; -import java.io.InputStream; -import java.net.*; -import java.security.GeneralSecurityException; -import java.security.cert.X509Certificate; -import java.util.List; -import java.util.Map; - -import javax.net.ssl.HttpsURLConnection; -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.math.NumberUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -/** - * @author jochen, michele, andrea - */ -public class HttpConnector { - - private static final Log log = LogFactory.getLog(HttpConnector.class); - - private int maxNumberOfRetry = 6; - private int defaultDelay = 120; // seconds - private int readTimeOut = 120; // seconds - - private String responseType = null; - - private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; - - public HttpConnector() { - CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); - } - - /** - * Given the URL returns the content via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource - * @throws CollectorServiceException when retrying more than maxNumberOfRetry times - */ - public String getInputSource(final String requestUrl) throws CollectorServiceException { - return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); - 
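/*
 * getInputSource and getInputSourceAsStream delegate to the recursive attempt* methods,
 * which retry up to maxNumberOfRetry times (sleeping defaultDelay seconds between
 * attempts, or honouring a Retry-After header on HTTP 503) and record one entry per
 * failed attempt in the CollectorPluginErrorLogList. A minimal usage sketch, assuming
 * the surviving class eu.dnetlib.dhp.collection.worker.utils.HttpConnector keeps the
 * same entry point (the updated tests below call getInputSource on it and expect a
 * DnetCollectorException on failure); the URL is a placeholder:
 *
 *   HttpConnector connector = new HttpConnector();
 *   connector.setMaxNumberOfRetry(3);
 *   String content = connector.getInputSource("https://example.org/some/resource");
 */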
} - - /** - * Given the URL returns the content as a stream via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource as InputStream - * @throws CollectorServiceException when retrying more than maxNumberOfRetry times - */ - public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException { - return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - } - - private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, - final CollectorPluginErrorLogList errorList) - throws CollectorServiceException { - try { - InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - try { - return IOUtils.toString(s); - } catch (IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); - } finally { - IOUtils.closeQuietly(s); - } - } catch (InterruptedException e) { - throw new CollectorServiceException(e); - } - } - - private InputStream attemptDownload(final String requestUrl, final int retryNumber, - final CollectorPluginErrorLogList errorList) - throws CollectorServiceException { - - if (retryNumber > maxNumberOfRetry) { - throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList); - } - - log.debug("Downloading " + requestUrl + " - try: " + retryNumber); - try { - InputStream input = null; - - try { - final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); - urlConn.setInstanceFollowRedirects(false); - urlConn.setReadTimeout(readTimeOut * 1000); - urlConn.addRequestProperty("User-Agent", userAgent); - - if (log.isDebugEnabled()) { - logHeaderFields(urlConn); - } - - int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); - if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { - log.warn("waiting and repeating request after " + retryAfter + " sec."); - Thread.sleep(retryAfter * 1000); - errorList.add("503 Service Unavailable"); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM) - || (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP)) { - final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); - log.debug("The requested url has been moved to " + newUrl); - errorList - .add( - String - .format( - "%s %s. 
Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), - newUrl)); - urlConn.disconnect(); - return attemptDownload(newUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { - log - .error( - String - .format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - Thread.sleep(defaultDelay * 1000); - errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else { - input = urlConn.getInputStream(); - responseType = urlConn.getContentType(); - return input; - } - } catch (IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } - } catch (InterruptedException e) { - throw new CollectorServiceException(e); - } - } - - private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { - log.debug("StatusCode: " + urlConn.getResponseMessage()); - - for (Map.Entry> e : urlConn.getHeaderFields().entrySet()) { - if (e.getKey() != null) { - for (String v : e.getValue()) { - log.debug(" key: " + e.getKey() + " - value: " + v); - } - } - } - } - - private int obtainRetryAfter(final Map> headerMap) { - for (String key : headerMap.keySet()) { - if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) - && NumberUtils.isCreatable(headerMap.get(key).get(0))) { - return Integer - .parseInt(headerMap.get(key).get(0)) + 10; - } - } - return -1; - } - - private String obtainNewLocation(final Map> headerMap) throws CollectorServiceException { - for (String key : headerMap.keySet()) { - if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { - return headerMap.get(key).get(0); - } - } - throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING"); - } - - /** - * register for https scheme; this is a workaround and not intended for the use in trusted environments - */ - public void initTrustManager() { - final X509TrustManager tm = new X509TrustManager() { - - @Override - public void checkClientTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public void checkServerTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - }; - try { - final SSLContext ctx = SSLContext.getInstance("TLS"); - ctx.init(null, new TrustManager[] { - tm - }, null); - HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); - } catch (GeneralSecurityException e) { - log.fatal(e); - throw new IllegalStateException(e); - } - } - - public int getMaxNumberOfRetry() { - return maxNumberOfRetry; - } - - public void setMaxNumberOfRetry(final int maxNumberOfRetry) { - this.maxNumberOfRetry = maxNumberOfRetry; - } - - public int getDefaultDelay() { - return defaultDelay; - } - - public void setDefaultDelay(final int defaultDelay) { - this.defaultDelay = defaultDelay; - } - - public int getReadTimeOut() { - return readTimeOut; - } - - public void setReadTimeOut(final int readTimeOut) { - this.readTimeOut = readTimeOut; - } - - public String getResponseType() { - return responseType; - } - -} diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java index 9dac34a15..dc6f46771 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java @@ -17,7 +17,7 @@ import org.apache.hadoop.fs.Path; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; import eu.dnetlib.dhp.application.ArgumentApplicationParser; /** diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java index 23b58f2a0..e665bc704 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager.project.utils; import java.io.*; import java.nio.charset.StandardCharsets; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -14,7 +15,7 @@ import org.apache.hadoop.fs.Path; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; /** diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index 861ae5201..c0bd4c940 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -43,6 +43,106 @@ public class GenerateNativeStoreSparkJob { private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class); + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateNativeStoreSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); + parser.parseArgument(args); + final ObjectMapper jsonMapper = new ObjectMapper(); + final Provenance provenance = jsonMapper.readValue(parser.get("provenance"), Provenance.class); + final long dateOfCollection = new Long(parser.get("dateOfCollection")); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final Map ongoingMap = new HashMap<>(); + final Map reportMap = new HashMap<>(); + + final boolean test = parser.get("isTest") == null ? 
false : Boolean.valueOf(parser.get("isTest")); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + final JavaPairRDD inputRDD = sc + .sequenceFile(parser.get("input"), IntWritable.class, Text.class); + + final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); + final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); + + final MessageManager manager = new MessageManager( + parser.get("rabbitHost"), + parser.get("rabbitUser"), + parser.get("rabbitPassword"), + false, + false, + null); + + final JavaRDD mappeRDD = inputRDD + .map( + item -> parseRecord( + item._2().toString(), + parser.get("xpath"), + parser.get("encoding"), + provenance, + dateOfCollection, + totalItems, + invalidRecords)) + .filter(Objects::nonNull) + .distinct(); + + ongoingMap.put("ongoing", "0"); + if (!test) { + manager + .sendMessage( + new Message( + parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), + parser.get("rabbitOngoingQueue"), + true, + false); + } + + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset mdstore = spark.createDataset(mappeRDD.rdd(), encoder); + final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords"); + mdStoreRecords.add(mdstore.count()); + ongoingMap.put("ongoing", "" + totalItems.value()); + if (!test) { + manager + .sendMessage( + new Message( + parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), + parser.get("rabbitOngoingQueue"), + true, + false); + } + mdstore.write().format("parquet").save(parser.get("output")); + reportMap.put("inputItem", "" + totalItems.value()); + reportMap.put("invalidRecords", "" + invalidRecords.value()); + reportMap.put("mdStoreSize", "" + mdStoreRecords.value()); + if (!test) { + manager + .sendMessage( + new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), + parser.get("rabbitReportQueue"), + true, + false); + manager.close(); + } + }); + + } + public static MetadataRecord parseRecord( final String input, final String xpath, @@ -73,103 +173,5 @@ public class GenerateNativeStoreSparkJob { } } - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - GenerateNativeStoreSparkJob.class - .getResourceAsStream( - "/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); - parser.parseArgument(args); - final ObjectMapper jsonMapper = new ObjectMapper(); - final Provenance provenance = jsonMapper.readValue(parser.get("provenance"), Provenance.class); - final long dateOfCollection = new Long(parser.get("dateOfCollection")); - - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - - final Map ongoingMap = new HashMap<>(); - final Map reportMap = new HashMap<>(); - - final boolean test = parser.get("isTest") == null ? 
false : Boolean.valueOf(parser.get("isTest")); - - SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - - final JavaPairRDD inputRDD = sc - .sequenceFile(parser.get("input"), IntWritable.class, Text.class); - - final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); - final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); - - final MessageManager manager = new MessageManager( - parser.get("rabbitHost"), - parser.get("rabbitUser"), - parser.get("rabbitPassword"), - false, - false, - null); - - final JavaRDD mappeRDD = inputRDD - .map( - item -> parseRecord( - item._2().toString(), - parser.get("xpath"), - parser.get("encoding"), - provenance, - dateOfCollection, - totalItems, - invalidRecords)) - .filter(Objects::nonNull) - .distinct(); - - ongoingMap.put("ongoing", "0"); - if (!test) { - manager - .sendMessage( - new Message( - parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), - parser.get("rabbitOngoingQueue"), - true, - false); - } - - final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mdstore = spark.createDataset(mappeRDD.rdd(), encoder); - final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords"); - mdStoreRecords.add(mdstore.count()); - ongoingMap.put("ongoing", "" + totalItems.value()); - if (!test) { - manager - .sendMessage( - new Message( - parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), - parser.get("rabbitOngoingQueue"), - true, - false); - } - mdstore.write().format("parquet").save(parser.get("output")); - reportMap.put("inputItem", "" + totalItems.value()); - reportMap.put("invalidRecords", "" + invalidRecords.value()); - reportMap.put("mdStoreSize", "" + mdStoreRecords.value()); - if (!test) { - manager - .sendMessage( - new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), - parser.get("rabbitReportQueue"), - true, - false); - manager.close(); - } - }); - - } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java index 59b536cd5..c1142ad9c 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java @@ -6,14 +6,15 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.List; +import eu.dnetlib.dhp.collection.worker.DnetCollectorException; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException; -import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; + import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser; @Disabled @@ -30,7 +31,7 @@ public class EXCELParserTest { } @Test - public void test1() throws CollectorServiceException, IOException, InvalidFormatException, ClassNotFoundException, + public void test1() throws DnetCollectorException, IOException, InvalidFormatException, 
ClassNotFoundException, IllegalAccessException, InstantiationException { EXCELParser excelParser = new EXCELParser(); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java index 90b3919ed..3b9d1c3ab 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.actionmanager.project.httpconnector; +import eu.dnetlib.dhp.collection.worker.DnetCollectorException; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; @@ -29,12 +31,12 @@ public class HttpConnectorTest { @Test - public void testGetInputSource() throws CollectorServiceException { + public void testGetInputSource() throws DnetCollectorException { System.out.println(connector.getInputSource(URL)); } @Test - public void testGoodServers() throws CollectorServiceException { + public void testGoodServers() throws DnetCollectorException { System.out.println(connector.getInputSource(URL_GOODSNI_SERVER)); } diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl index cef50aa95..f22db961b 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl @@ -1,7 +1,7 @@ @@ -9,7 +9,7 @@ - + From a54848a59c2b4971569f2036ef2851f87c578701 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 25 Jan 2021 15:43:04 +0100 Subject: [PATCH 03/86] Moved Vocabulary stuff to common module --- .../java/eu/dnetlib/dhp/common/vocabulary}/Vocabulary.java | 2 +- .../eu/dnetlib/dhp/common/vocabulary}/VocabularyGroup.java | 2 +- .../eu/dnetlib/dhp/common/vocabulary}/VocabularyTerm.java | 2 +- .../eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java | 2 +- .../java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java | 2 +- .../dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java | 2 +- .../dhp/oa/graph/raw/GenerateEntitiesApplication.java | 5 +---- .../dhp/oa/graph/raw/MigrateDbEntitiesApplication.java | 2 +- .../java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java | 2 +- .../java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java | 2 +- .../eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java | 2 +- .../dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java | 2 +- .../test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 2 +- .../dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java | 2 +- 14 files changed, 14 insertions(+), 17 deletions(-) rename {dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common => dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary}/Vocabulary.java (98%) rename {dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common => dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary}/VocabularyGroup.java (99%) rename {dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common => 
dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary}/VocabularyTerm.java (88%) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java similarity index 98% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java index bfc4fd6f1..1e333e93f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/Vocabulary.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.oa.graph.raw.common; +package eu.dnetlib.dhp.common.vocabulary; import java.io.Serializable; import java.util.HashMap; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java similarity index 99% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java index 32452bdc5..fac55189b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.oa.graph.raw.common; +package eu.dnetlib.dhp.common.vocabulary; import java.io.Serializable; import java.util.*; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyTerm.java similarity index 88% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyTerm.java index 1aa1b8253..52eb7ca23 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyTerm.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyTerm.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.oa.graph.raw.common; +package eu.dnetlib.dhp.common.vocabulary; import java.io.Serializable; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index 8231dd77e..31fe116e3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -21,7 +21,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.utils.ISLookupClientFactory; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index d2d4e118f..e0cb354d4 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -7,7 +7,7 @@ import java.util.HashMap; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Country; import eu.dnetlib.dhp.schema.oaf.Qualifier; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index cccf15398..079984a81 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -38,7 +38,7 @@ import org.dom4j.DocumentFactory; import org.dom4j.DocumentHelper; import org.dom4j.Node; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.LicenseComparator; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Author; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index cfd190670..a2db4a506 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -3,7 +3,6 @@ package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.Objects; @@ -12,8 +11,6 @@ import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -27,7 +24,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.utils.ISLookupClientFactory; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index 3adbd244c..db1a2ef57 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -57,7 +57,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import 
eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; import eu.dnetlib.dhp.oa.graph.raw.common.VerifyNsPrefixPredicate; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Dataset; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index e62bc0790..4a8b24cf2 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -18,7 +18,7 @@ import org.dom4j.Node; import com.google.common.collect.Lists; import eu.dnetlib.dhp.common.PacePerson; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Field; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 6d2e28ba8..bc47e778f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -19,7 +19,7 @@ import org.dom4j.Node; import com.google.common.collect.Lists; import eu.dnetlib.dhp.common.PacePerson; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Field; diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java index cb34b0cb3..9e161da6e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java @@ -18,7 +18,7 @@ import org.mockito.junit.jupiter.MockitoExtension; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Result; diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java index 705f1dddb..8293faac4 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java @@ -16,7 +16,7 @@ import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest; -import 
eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 2a62e25b2..5c8e4e4c6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -22,7 +22,7 @@ import org.mockito.junit.jupiter.MockitoExtension; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Dataset; diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java index 0d1ec1ad1..b38da4569 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java @@ -27,7 +27,7 @@ import org.mockito.junit.jupiter.MockitoExtension; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.*; @ExtendWith(MockitoExtension.class) From 184e7b385675120faa14d120221ee7221a454ac0 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 27 Jan 2021 15:43:08 +0100 Subject: [PATCH 04/86] Implemented new Transformation using spark --- .../common/AggregationCounter.java | 45 + .../DnetTransformationException.java | 28 + .../dhp/transformation/TransformFunction.java | 74 - .../transformation/TransformSparkJobNode.java | 86 +- .../transformation/TransformationFactory.java | 62 + .../{functions => xslt}/Cleaner.java | 21 +- .../xslt/XSLTTransformationFunction.java | 66 + .../transformation_input_parameters.json | 38 +- .../transformation/TransformationJobTest.java | 163 +-- .../eu/dnetlib/dhp/transform/ext_simple.xsl | 9 +- .../eu/dnetlib/dhp/transform/input.xml | 99 +- .../eu/dnetlib/dhp/transform/synonyms.txt | 1234 +++++++++++++++++ .../eu/dnetlib/dhp/transform/terms.txt | 1080 +++++++++++++++ 13 files changed, 2704 insertions(+), 301 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationCounter.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/{functions => xslt}/Cleaner.java (61%) create mode 100644 
dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/synonyms.txt create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/terms.txt diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationCounter.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationCounter.java new file mode 100644 index 000000000..1ac2cb54b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationCounter.java @@ -0,0 +1,45 @@ +package eu.dnetlib.dhp.aggregation.common; + +import org.apache.spark.util.LongAccumulator; + +import java.io.Serializable; + + +public class AggregationCounter implements Serializable { + private LongAccumulator totalItems; + private LongAccumulator errorItems; + private LongAccumulator processedItems; + + public AggregationCounter() { + } + + public AggregationCounter(LongAccumulator totalItems, LongAccumulator errorItems, LongAccumulator processedItems) { + this.totalItems = totalItems; + this.errorItems = errorItems; + this.processedItems = processedItems; + } + + public LongAccumulator getTotalItems() { + return totalItems; + } + + public void setTotalItems(LongAccumulator totalItems) { + this.totalItems = totalItems; + } + + public LongAccumulator getErrorItems() { + return errorItems; + } + + public void setErrorItems(LongAccumulator errorItems) { + this.errorItems = errorItems; + } + + public LongAccumulator getProcessedItems() { + return processedItems; + } + + public void setProcessedItems(LongAccumulator processedItems) { + this.processedItems = processedItems; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java new file mode 100644 index 000000000..2c932e40b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java @@ -0,0 +1,28 @@ +package eu.dnetlib.dhp.transformation; + +public class DnetTransformationException extends Exception { + + public DnetTransformationException() { + super(); + } + + public DnetTransformationException( + final String message, + final Throwable cause, + final boolean enableSuppression, + final boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } + + public DnetTransformationException(final String message, final Throwable cause) { + super(message, cause); + } + + public DnetTransformationException(final String message) { + super(message); + } + + public DnetTransformationException(final Throwable cause) { + super(cause); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java deleted file mode 100644 index f4bf78e18..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java +++ /dev/null @@ -1,74 +0,0 @@ - -package eu.dnetlib.dhp.transformation; - -import java.io.ByteArrayInputStream; -import java.io.StringWriter; -import java.util.Map; - -import javax.xml.transform.stream.StreamSource; - -import 
org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.util.LongAccumulator; - -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.functions.Cleaner; -import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import net.sf.saxon.s9api.*; - -public class TransformFunction implements MapFunction { - - private final LongAccumulator totalItems; - private final LongAccumulator errorItems; - private final LongAccumulator transformedItems; - private final String transformationRule; - private final Cleaner cleanFunction; - - private final long dateOfTransformation; - - public TransformFunction( - LongAccumulator totalItems, - LongAccumulator errorItems, - LongAccumulator transformedItems, - final String transformationRule, - long dateOfTransformation, - final Map vocabularies) - throws Exception { - this.totalItems = totalItems; - this.errorItems = errorItems; - this.transformedItems = transformedItems; - this.transformationRule = transformationRule; - this.dateOfTransformation = dateOfTransformation; - cleanFunction = new Cleaner(vocabularies); - } - - @Override - public MetadataRecord call(MetadataRecord value) { - totalItems.add(1); - try { - Processor processor = new Processor(false); - processor.registerExtensionFunction(cleanFunction); - final XsltCompiler comp = processor.newXsltCompiler(); - XsltExecutable xslt = comp - .compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes()))); - XdmNode source = processor - .newDocumentBuilder() - .build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes()))); - XsltTransformer trans = xslt.load(); - trans.setInitialContextNode(source); - final StringWriter output = new StringWriter(); - Serializer out = processor.newSerializer(output); - out.setOutputProperty(Serializer.Property.METHOD, "xml"); - out.setOutputProperty(Serializer.Property.INDENT, "yes"); - trans.setDestination(out); - trans.transform(); - final String xml = output.toString(); - value.setBody(xml); - value.setDateOfTransformation(dateOfTransformation); - transformedItems.add(1); - return value; - } catch (Throwable e) { - errorItems.add(1); - return null; - } - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index 8737d36ef..6e07e5173 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -9,9 +9,15 @@ import java.util.Map; import java.util.Objects; import java.util.Optional; -import org.apache.commons.cli.*; +import eu.dnetlib.dhp.aggregation.common.AggregationCounter; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; @@ -25,9 +31,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import 
eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob;
 import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
-import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
 import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
 import eu.dnetlib.dhp.utils.DHPUtils;
 import eu.dnetlib.message.Message;
@@ -57,65 +61,39 @@ public class TransformSparkJobNode {
 
         final String inputPath = parser.get("input");
         final String outputPath = parser.get("output");
+        // TODO this variable will be used after implementing Messaging with DNet Aggregator
         final String workflowId = parser.get("workflowId");
-        final String trasformationRule = extractXSLTFromTR(
-            Objects.requireNonNull(DHPUtils.decompressString(parser.get("transformationRule"))));
-        final String rabbitUser = parser.get("rabbitUser");
-        final String rabbitPassword = parser.get("rabbitPassword");
-        final String rabbitHost = parser.get("rabbitHost");
-        final String rabbitReportQueue = parser.get("rabbitReportQueue");
-        final long dateOfCollection = new Long(parser.get("dateOfCollection"));
-        final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest"));
+        final String isLookupUrl = parser.get("isLookupUrl");
+        log.info(String.format("isLookupUrl: %s", isLookupUrl));
+
+        final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
 
         SparkConf conf = new SparkConf();
         runWithSparkSession(
             conf,
             isSparkSessionManaged,
-            spark -> {
-                final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
-                final Dataset<MetadataRecord> mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder);
-                final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems");
-                final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems");
-                final LongAccumulator transformedItems = spark.sparkContext().longAccumulator("transformedItems");
-                final Map<String, Vocabulary> vocabularies = new HashMap<>();
-                vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages"));
-                final TransformFunction transformFunction = new TransformFunction(
-                    totalItems,
-                    errorItems,
-                    transformedItems,
-                    trasformationRule,
-                    dateOfCollection,
-                    vocabularies);
-                mdstoreInput.map(transformFunction, encoder).write().format("parquet").save(outputPath);
-                if (rabbitHost != null) {
-                    System.out.println("SEND FINAL REPORT");
-                    final Map<String, String> reportMap = new HashMap<>();
-                    reportMap.put("inputItem", "" + totalItems.value());
-                    reportMap.put("invalidRecords", "" + errorItems.value());
-                    reportMap.put("mdStoreSize", "" + transformedItems.value());
-                    System.out.println(new Message(workflowId, "Transform", MessageType.REPORT, reportMap));
-                    if (!test) {
-                        final MessageManager manager = new MessageManager(rabbitHost, rabbitUser, rabbitPassword, false,
-                            false,
-                            null);
-                        manager
-                            .sendMessage(
-                                new Message(workflowId, "Transform", MessageType.REPORT, reportMap),
-                                rabbitReportQueue,
-                                true,
-                                false);
-                        manager.close();
-                    }
-                }
-            });
-
+            spark -> transformRecords(parser.getObjectMap(), isLookupService, spark, inputPath, outputPath));
     }
 
-    private static String extractXSLTFromTR(final String tr) throws DocumentException {
-        SAXReader reader = new SAXReader();
-        Document document = reader.read(new ByteArrayInputStream(tr.getBytes()));
-        Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']");
-        return node.asXML();
+
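+    /*
+     * The body of the Spark job has been extracted into transformRecords(...) below,
+     * so the transformation can be driven from tests with an existing SparkSession
+     * and a mocked ISLookUpService (as TransformationJobTest in this patch does),
+     * while main() only parses the arguments, builds the IS lookup client and delegates.
+     */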
+    public static void transformRecords(final Map<String, String> args, final ISLookUpService isLookUpService,
+        final SparkSession spark, final String inputPath, final String outputPath) throws DnetTransformationException {
+
+        final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems");
+        final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems");
+        final LongAccumulator transformedItems = spark.sparkContext().longAccumulator("transformedItems");
+        final AggregationCounter ct = new AggregationCounter(totalItems, errorItems, transformedItems);
+        final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
+        final Dataset<MetadataRecord> mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder);
+        final MapFunction<MetadataRecord, MetadataRecord> XSLTTransformationFunction = TransformationFactory
+            .getTransformationPlugin(args, ct, isLookUpService);
+        mdstoreInput.map(XSLTTransformationFunction, encoder).write().save(outputPath);
+
+        log.info("Transformed item " + ct.getProcessedItems().count());
+        log.info("Total item " + ct.getTotalItems().count());
+        log.info("Transformation Error item " + ct.getErrorItems().count());
     }
+
+
+
+
 }
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java
new file mode 100644
index 000000000..0296458a5
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java
@@ -0,0 +1,62 @@
+package eu.dnetlib.dhp.transformation;
+
+import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
+import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.api.java.function.MapFunction;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+import java.util.Map;
+
+public class TransformationFactory {
+
+    private static final Logger log = LoggerFactory.getLogger(TransformationFactory.class);
+    public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') where $x//TITLE = \"%s\" return $x//CODE/text()";
+
+
+    public static MapFunction<MetadataRecord, MetadataRecord> getTransformationPlugin(
+        final Map<String, String> jobArgument, final AggregationCounter counters,
+        final ISLookUpService isLookupService) throws DnetTransformationException {
+
+        try {
+            final String transformationPlugin = jobArgument.get("transformationPlugin");
+
+            log.info("Transformation plugin required " + transformationPlugin);
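+            /*
+             * Dispatch on the requested plugin name; only XSLT_TRANSFORM is implemented.
+             * A new plugin would add a case here that returns another
+             * MapFunction<MetadataRecord, MetadataRecord> built from the job arguments,
+             * for example (hypothetical, not part of this change):
+             *
+             *   case "IDENTITY_TRANSFORM":
+             *       return record -> record;
+             */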
+            switch (transformationPlugin) {
+                case "XSLT_TRANSFORM": {
+                    final String transformationRuleName = jobArgument.get("transformationRule");
+                    if (StringUtils.isBlank(transformationRuleName))
+                        throw new DnetTransformationException("Missing Parameter transformationRule");
+                    final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService);
+
+                    final String transformationRule = queryTransformationRuleFromIS(transformationRuleName, isLookupService);
+
+                    final long dateOfTransformation = new Long(jobArgument.get("dateOfTransformation"));
+                    return new XSLTTransformationFunction(counters, transformationRule, dateOfTransformation, vocabularies);
+
+                }
+                default:
+                    throw new DnetTransformationException("transformation plugin does not exists for " + transformationPlugin);
+
+            }
+
+        } catch (Throwable e) {
+            throw new DnetTransformationException(e);
+        }
+    }
+
+    private static String queryTransformationRuleFromIS(final String transformationRuleName,
+        final ISLookUpService isLookUpService) throws Exception {
+        final String query = String.format(TRULE_XQUERY, transformationRuleName);
+        log.info("asking query to IS: " + query);
+        List<String> result = isLookUpService.quickSearchProfile(query);
+
+        if (result == null || result.isEmpty())
+            throw new DnetTransformationException("Unable to find transformation rule with name: " + transformationRuleName);
+        return result.get(0);
+    }
+
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java
similarity index 61%
rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java
rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java
index 7f9b6646c..2c6d776af 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java
@@ -1,19 +1,17 @@
 
-package eu.dnetlib.dhp.transformation.functions;
+package eu.dnetlib.dhp.transformation.xslt;
 
-import java.util.Map;
-import java.util.Optional;
 
-import eu.dnetlib.dhp.transformation.vocabulary.Term;
-import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import net.sf.saxon.s9api.*;
 import scala.Serializable;
 
 public class Cleaner implements ExtensionFunction, Serializable {
 
-    private final Map<String, Vocabulary> vocabularies;
+    private final VocabularyGroup vocabularies;
 
-    public Cleaner(Map<String, Vocabulary> vocabularies) {
+    public Cleaner(final VocabularyGroup vocabularies) {
         this.vocabularies = vocabularies;
     }
 
@@ -39,14 +37,9 @@ public class Cleaner implements ExtensionFunction, Serializable {
     public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException {
         final String currentValue = xdmValues[0].itemAt(0).getStringValue();
         final String vocabularyName = xdmValues[1].itemAt(0).getStringValue();
-        Optional<Term> cleanedValue = vocabularies
-            .get(vocabularyName)
-            .getTerms()
-            .stream()
-            .filter(it -> it.getNativeName().equalsIgnoreCase(currentValue))
-            .findAny();
+        Qualifier cleanedValue = vocabularies.getSynonymAsQualifier(vocabularyName, currentValue);
         return new XdmAtomicValue(
-            cleanedValue.isPresent() ? cleanedValue.get().getCode() : currentValue);
+            cleanedValue != null ? cleanedValue.getClassid() : currentValue);
     }
 }
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java
new file mode 100644
index 000000000..c02b83345
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java
@@ -0,0 +1,66 @@
+
+package eu.dnetlib.dhp.transformation.xslt;
+
+import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
+import net.sf.saxon.s9api.*;
+import org.apache.spark.api.java.function.MapFunction;
+
+import javax.xml.transform.stream.StreamSource;
+import java.io.ByteArrayInputStream;
+import java.io.StringWriter;
+
+public class XSLTTransformationFunction implements MapFunction<MetadataRecord, MetadataRecord> {
+
+    private final AggregationCounter aggregationCounter;
+
+    private final String transformationRule;
+
+    private final Cleaner cleanFunction;
+
+    private final long dateOfTransformation;
+
+    public XSLTTransformationFunction(
+        final AggregationCounter aggregationCounter,
+        final String transformationRule,
+        long dateOfTransformation,
+        final VocabularyGroup vocabularies)
+        throws Exception {
+        this.aggregationCounter = aggregationCounter;
+        this.transformationRule = transformationRule;
+        this.dateOfTransformation = dateOfTransformation;
+        cleanFunction = new Cleaner(vocabularies);
+    }
+
+    @Override
+    public MetadataRecord call(MetadataRecord value) {
+        aggregationCounter.getTotalItems().add(1);
+        try {
+            Processor processor = new Processor(false);
+            processor.registerExtensionFunction(cleanFunction);
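+            /*
+             * Registering the Cleaner on the Saxon Processor exposes it to the stylesheet
+             * as an extension function (under the QName the class declares), so the XSLT
+             * rule can normalise field values against the vocabularies loaded from the
+             * information service while each record is transformed.
+             */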
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java
new file mode 100644
index 000000000..c02b83345
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java
@@ -0,0 +1,66 @@
+
+package eu.dnetlib.dhp.transformation.xslt;
+
+import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
+import net.sf.saxon.s9api.*;
+import org.apache.spark.api.java.function.MapFunction;
+
+import javax.xml.transform.stream.StreamSource;
+import java.io.ByteArrayInputStream;
+import java.io.StringWriter;
+import java.nio.charset.StandardCharsets;
+
+public class XSLTTransformationFunction implements MapFunction<MetadataRecord, MetadataRecord> {
+
+	private final AggregationCounter aggregationCounter;
+
+	private final String transformationRule;
+
+	private final Cleaner cleanFunction;
+
+	private final long dateOfTransformation;
+
+	public XSLTTransformationFunction(
+		final AggregationCounter aggregationCounter,
+		final String transformationRule,
+		long dateOfTransformation,
+		final VocabularyGroup vocabularies)
+		throws Exception {
+		this.aggregationCounter = aggregationCounter;
+		this.transformationRule = transformationRule;
+		this.dateOfTransformation = dateOfTransformation;
+		cleanFunction = new Cleaner(vocabularies);
+	}
+
+	@Override
+	public MetadataRecord call(MetadataRecord value) {
+		aggregationCounter.getTotalItems().add(1);
+		try {
+			Processor processor = new Processor(false);
+			processor.registerExtensionFunction(cleanFunction);
+			final XsltCompiler comp = processor.newXsltCompiler();
+			XsltExecutable xslt = comp
+				.compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes(StandardCharsets.UTF_8))));
+			XdmNode source = processor
+				.newDocumentBuilder()
+				.build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes(StandardCharsets.UTF_8))));
+			XsltTransformer trans = xslt.load();
+			trans.setInitialContextNode(source);
+			final StringWriter output = new StringWriter();
+			Serializer out = processor.newSerializer(output);
+			out.setOutputProperty(Serializer.Property.METHOD, "xml");
+			out.setOutputProperty(Serializer.Property.INDENT, "yes");
+			trans.setDestination(out);
+			trans.transform();
+			final String xml = output.toString();
+			value.setBody(xml);
+			value.setDateOfTransformation(dateOfTransformation);
+			aggregationCounter.getProcessedItems().add(1);
+			return value;
+		} catch (Throwable e) {
+			// failed records increment the error counter and are mapped to null
+			aggregationCounter.getErrorItems().add(1);
+			return null;
+		}
+	}
+}
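XSLTTransformationFunction is self-contained, so it can be exercised on a single record without a Spark job; a minimal sketch (editor's illustration; `counters`, `vocabularies`, `xslt` and `nativeXml` are assumed inputs, built for instance the way the tests further below build them):

    // Sketch only: applying the transformation to one record outside Spark.
    XSLTTransformationFunction fn =
        new XSLTTransformationFunction(counters, xslt, System.currentTimeMillis(), vocabularies);
    MetadataRecord in = new MetadataRecord();
    in.setBody(nativeXml);                 // the native record payload
    MetadataRecord out = fn.call(in);      // null when the stylesheet fails; errorItems is incremented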
"rabbitUser", - "paramDescription": "the user to connect with RabbitMq for messaging", + "paramName": "tp", + "paramLongName": "transformationPlugin", + "paramDescription": "the transformation plugin to apply", "paramRequired": true - }, - { - "paramName": "rp", - "paramLongName": "rabbitPassword", - "paramDescription": "the password to connect with RabbitMq for messaging", - "paramRequired": true - }, - { - "paramName": "rh", - "paramLongName": "rabbitHost", - "paramDescription": "the host of the RabbitMq server", - "paramRequired": true - }, - { - "paramName": "ro", - "paramLongName": "rabbitOngoingQueue", - "paramDescription": "the name of the ongoing queue", - "paramRequired": true - }, - { - "paramName": "rr", - "paramLongName": "rabbitReportQueue", - "paramDescription": "the name of the report queue", - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "isTest", - "paramDescription": "the name of the report queue", - "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 98c8cf66c..5479e0b57 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -2,15 +2,23 @@ package eu.dnetlib.dhp.transformation; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.mockito.Mockito.lenient; +import java.io.IOException; import java.io.StringWriter; import java.nio.file.Files; import java.nio.file.Path; -import java.util.HashMap; -import java.util.Map; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; import javax.xml.transform.stream.StreamSource; +import eu.dnetlib.dhp.aggregation.common.AggregationCounter; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.SparkSession; @@ -18,28 +26,34 @@ import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; import org.dom4j.Node; import org.dom4j.io.SAXReader; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.DisplayName; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.*; import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.io.TempDir; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; - import eu.dnetlib.dhp.collection.CollectionJobTest; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.functions.Cleaner; -import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; -import eu.dnetlib.dhp.utils.DHPUtils; -import net.sf.saxon.s9api.*; @ExtendWith(MockitoExtension.class) public class TransformationJobTest { private static SparkSession spark; + @Mock + private ISLookUpService isLookUpService; + + private VocabularyGroup vocabularies; + + @BeforeEach + public void setUp() throws ISLookUpException, IOException { + 
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java
index 98c8cf66c..5479e0b57 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java
@@ -2,15 +2,23 @@
 package eu.dnetlib.dhp.transformation;
 
 import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.mockito.Mockito.lenient;
 
+import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.HashMap;
-import java.util.Map;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 import javax.xml.transform.stream.StreamSource;
 
+import eu.dnetlib.dhp.aggregation.common.AggregationCounter;
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.SparkSession;
@@ -18,28 +26,34 @@
 import org.apache.spark.util.LongAccumulator;
 import org.dom4j.Document;
 import org.dom4j.Node;
 import org.dom4j.io.SAXReader;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.DisplayName;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
 import org.junit.jupiter.api.extension.ExtendWith;
 import org.junit.jupiter.api.io.TempDir;
 import org.mockito.Mock;
 import org.mockito.junit.jupiter.MockitoExtension;
-
 import eu.dnetlib.dhp.collection.CollectionJobTest;
 import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
-import eu.dnetlib.dhp.transformation.functions.Cleaner;
-import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
-import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
-import eu.dnetlib.dhp.utils.DHPUtils;
-import net.sf.saxon.s9api.*;
 
 @ExtendWith(MockitoExtension.class)
 public class TransformationJobTest {
 
 	private static SparkSession spark;
 
+	@Mock
+	private ISLookUpService isLookUpService;
+
+	private VocabularyGroup vocabularies;
+
+	@BeforeEach
+	public void setUp() throws ISLookUpException, IOException {
+		lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
+
+		lenient()
+			.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
+			.thenReturn(synonyms());
+		vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
+	}
+
 	@BeforeAll
 	public static void beforeAll() {
 		SparkConf conf = new SparkConf();
@@ -53,64 +67,51 @@
 		spark.stop();
 	}
 
-	@Mock
-	private LongAccumulator accumulator;
 
 	@Test
+	@DisplayName("Test Transform Single XML using XSLTTransformationFunction")
 	public void testTransformSaxonHE() throws Exception {
 
-		Map<String, Vocabulary> vocabularies = new HashMap<>();
-		vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages"));
-		Cleaner cleanFunction = new Cleaner(vocabularies);
-		Processor proc = new Processor(false);
-		proc.registerExtensionFunction(cleanFunction);
-		final XsltCompiler comp = proc.newXsltCompiler();
-		XsltExecutable exp = comp
-			.compile(
-				new StreamSource(
-					this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/ext_simple.xsl")));
-		XdmNode source = proc
-			.newDocumentBuilder()
-			.build(
-				new StreamSource(
-					this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml")));
-		XsltTransformer trans = exp.load();
-		trans.setInitialContextNode(source);
-		final StringWriter output = new StringWriter();
-		Serializer out = proc.newSerializer(output);
-		out.setOutputProperty(Serializer.Property.METHOD, "xml");
-		out.setOutputProperty(Serializer.Property.INDENT, "yes");
-		trans.setDestination(out);
-		trans.transform();
-		System.out.println(output.toString());
+		// set the input record, loading the XML from the classpath
+		final MetadataRecord mr = new MetadataRecord();
+		mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml")));
+
+		// load the XSLT transformation rule from the classpath
+		XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/ext_simple.xsl");
+
+		// print the transformed record
+		System.out.println(tr.call(mr).getBody());
+		// TODO: add meaningful assertions
 	}
 
+	@DisplayName("Test TransformSparkJobNode.main")
 	@Test
 	public void transformTest(@TempDir Path testDir) throws Exception {
+
 		final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile();
 		final String mdstore_output = testDir.toString() + "/version";
 
-		final String xslt = DHPUtils
-			.compressString(
-				IOUtils
-					.toString(
-						this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")));
-		TransformSparkJobNode
-			.main(
-				new String[] {
-					"-issm", "true",
-					"-i", mdstore_input,
-					"-o", mdstore_output,
-					"-d", "1",
-					"-w", "1",
-					"-tr", xslt,
-					"-t", "true",
-					"-ru", "",
-					"-rp", "",
-					"-rh", "",
-					"-ro", "",
-					"-rr", ""
-				});
+		mockupTransformationRule("simpleTRule", "/eu/dnetlib/dhp/transform/ext_simple.xsl");
+
+//		final String arguments = "-issm true -i %s -o %s -d 1 -w 1 -tp XSLT_TRANSFORM -tr simpleTRule";
+
+		final Map<String, String> parameters = Stream.of(new String[][] {
+			{ "dateOfTransformation", "1234" },
+			{ "transformationPlugin", "XSLT_TRANSFORM" },
+			{ "transformationRule", "simpleTRule" },
+		}).collect(Collectors.toMap(data -> data[0], data -> data[1]));
+
+		TransformSparkJobNode.transformRecords(parameters, isLookUpService, spark, mdstore_input, mdstore_output);
+
+		// TODO: introduce useful assertions
 	}
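The transformTest above still ends on a TODO; a first assertion it could grow from (editor's sketch; it assumes the transformed store is written as parquet, as transformRecords does, and a static import of org.junit.jupiter.api.Assertions.assertTrue):

    // Sketch only: verify the job produced a non-empty transformed mdstore.
    final Encoder<MetadataRecord> enc = Encoders.bean(MetadataRecord.class);
    final long transformed = spark.read().format("parquet").load(mdstore_output).as(enc).count();
    assertTrue(transformed > 0, "the transformed mdstore should not be empty");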
@@ -127,39 +128,27 @@
 		Files.deleteIfExists(tempDirWithPrefix);
 	}
 
-	@Test
-	public void testTransformFunction() throws Exception {
-		SAXReader reader = new SAXReader();
-		Document document = reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml"));
-		Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']");
-		final String xslt = node.asXML();
-		Map<String, Vocabulary> vocabularies = new HashMap<>();
-		vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages"));
-		TransformFunction tf = new TransformFunction(accumulator, accumulator, accumulator, xslt, 1, vocabularies);
+	private void mockupTransformationRule(final String trule, final String path) throws Exception {
+		final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path));
 
-		MetadataRecord record = new MetadataRecord();
-		record
-			.setBody(
-				IOUtils
-					.toString(
-						this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml")));
-
-		final MetadataRecord result = tf.call(record);
-		assertNotNull(result.getBody());
-
-		System.out.println(result.getBody());
+		lenient()
+			.when(isLookUpService.quickSearchProfile(String.format(TransformationFactory.TRULE_XQUERY, trule)))
+			.thenReturn(Collections.singletonList(trValue));
 	}
 
-	@Test
-	public void extractTr() throws Exception {
+	private XSLTTransformationFunction loadTransformationRule(final String path) throws Exception {
+		final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path));
+		final LongAccumulator la = new LongAccumulator();
+		return new XSLTTransformationFunction(new AggregationCounter(la, la, la), trValue, 0, vocabularies);
+	}
 
-		final String xmlTr = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml"));
+	private List<String> vocs() throws IOException {
+		return IOUtils
+			.readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt"));
+	}
 
-		SAXReader reader = new SAXReader();
-		Document document = reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml"));
-		Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']");
-
-		System.out.println(node.asXML());
+	private List<String> synonyms() throws IOException {
+		return IOUtils
+			.readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt"));
 	}
 }
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl
index f22db961b..9e5f84c11 100644
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl
@@ -1,15 +1,16 @@
+	exclude-result-prefixes="xsl vocabulary">
-
-
+
+
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml
index 8760d3117..8efb3c487 100644
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml
@@ -1,37 +1,68 @@
-
-
- oai:research.chalmers.se:243692 - 2018-01-25T18:04:43Z - openaire -
- - - Incipient Berezinskii-Kosterlitz-Thouless transition in two-dimensional coplanar Josephson junctions - https://research.chalmers.se/en/publication/243692 - 2016 - Massarotti, D. - Jouault, B. - Rouco, V. - Charpentier, Sophie - Bauch, Thilo - Michon, A. - De Candia, A. - Lucignano, P. - Lombardi, Floriana - Tafuri, F. - Tagliacozzo, A. - Acoli - Abkhazian - Condensed Matter Physics - Superconducting hybrid junctions are revealing a variety of effects. Some of them are due to the special layout of these devices, which often use a coplanar configuration with relatively large barrier channels and the possibility of hosting Pearl vortices. A Josephson junction with a quasi-ideal two-dimensional barrier has been realized by growing graphene on SiC with Al electrodes. Chemical vapor deposition offers centimeter size monolayer areas where it is possible to realize a comparative analysis of different devices with nominally the same barrier. In samples with a graphene gap below 400 nm, we have found evidence of Josephson coherence in the presence of an incipient Berezinskii-Kosterlitz-Thouless transition. When the magnetic field is cycled, a remarkable hysteretic collapse and revival of the Josephson supercurrent occurs. Similar hysteresis are found in granular systems and are usually justified within the Bean critical state model (CSM). We show that the CSM, with appropriate account for the low-dimensional geometry, can partly explain the odd features measured in these junctions. - info:eu-repo/grantAgreement/EC/FP7/604391//Graphene-Based Revolutions in ICT And Beyond (Graphene Flagship)/ - info:eu-repo/semantics/altIdentifier/doi/10.1103/PhysRevB.94.054525 - info:eu-repo/semantics/article - Physical Review B vol.94(2016) - info:eu-repo/semantics/openAccess + + + + od______2294::00029b7f0a2a7e090e55b625a9079d83 + oai:pub.uni-bielefeld.de:2578942 + 2018-11-23T15:15:33.974+01:00 + od______2294 + oai:pub.uni-bielefeld.de:2578942 + 2018-07-24T13:01:16Z + conference + ddc:000 + conferenceFtxt + driver + open_access + + + + Mobile recommendation agents making online use of visual attention information at the point of sale + Pfeiffer, Thies + Pfeiffer, Jella + Meißner, Martin + Davis, Fred + Riedl, René + Jan, vom Brocke + Léger, Pierre-Majorique + Randolph, Adriane + Mobile Cognitive Assistance Systems + Information Systems + ddc:000 + We aim to utilize online information about visual attention for developing mobile recommendation agents (RAs) for use at the point of sale. Up to now, most RAs are focussed exclusively at personalization in an e-commerce setting. Very little is known, however, about mobile RAs that offer information and assistance at the point of sale based on individual-level feature based preference models (Murray and Häubl 2009). Current attempts provide information about products at the point of sale by manually scanning barcodes or using RFID (Kowatsch et al. 2011, Heijden 2005), e.g. using specific apps for smartphones. We argue that an online access to the current visual attention of the user offers a much larger potential. Integrating mobile eye tracking into ordinary glasses would yield a direct benefit of applying neuroscience methods in the user’s everyday life. First, learning from consumers’ attentional processes over time and adapting recommendations based on this learning allows us to provide very accurate and relevant recommendations, potentially increasing the perceived usefulness. 
Second, our proposed system needs little explicit user input (no scanning or navigation on screen) making it easy to use. Thus, instead of learning from click behaviour and past customer ratings, as it is the case in the e-commerce setting, the mobile RA learns from eye movements by participating online in every day decision processes. We argue that mobile RAs should be built based on current research in human judgment and decision making (Murray et al. 2010). In our project, we therefore follow a two-step approach: In the empirical basic research stream, we aim to understand the user’s interaction with the product shelf: the actions and patterns of user’s behaviour (eye movements, gestures, approaching a product closer) and their correspondence to the user’s informational needs. In the empirical system development stream, we create prototypes of mobile RAs and test experimentally the factors that influence the user’s adoption. For example, we suggest that a user’s involvement in the process, such as a need for exact nutritional information or for assistance (e.g., reading support for elderly) will influence the user’s intention to use such as system. The experiments are conducted both in our immersive virtual reality supermarket presented in a CAVE, where we can also easily display information to the user and track the eye movement in great accuracy, as well as in real-world supermarkets (see Figure 1), so that the findings can be better generalized to natural decision situations (Gidlöf et al. 2013). In a first pilot study with five randomly chosen participants in a supermarket, we evaluated which sort of mobile RAs consumers favour in order to get a first impression of the user’s acceptance of the technology. Figure 1 shows an excerpt of one consumer’s eye movements during a decision process. First results show long eye cascades and short fixations on many products in situations where users are uncertain and in need for support. Furthermore, we find a surprising acceptance of the technology itself throughout all ages (23 – 61 years). At the same time, consumers express serious fear of being manipulated by such a technology. For that reason, they strongly prefer the information to be provided by trusted third party or shared with family members and friends (see also Murray and Häubl 2009). Our pilot will be followed by a larger field experiment in March in order to learn more about factors that influence the user’s acceptance as well as the eye movement patterns that reflect typical phases of decision processes and indicate the need for support by a RA. + 2013 + info:eu-repo/semantics/conferenceObject + doc-type:conferenceObject + text + https://pub.uni-bielefeld.de/record/2578942 + https://pub.uni-bielefeld.de/download/2578942/2602478 + Pfeiffer T, Pfeiffer J, Meißner M. Mobile recommendation agents making online use of visual attention information at the point of sale. In: Davis F, Riedl R, Jan vom B, Léger P-M, Randolph A, eds. Proceedings of the Gmunden Retreat on NeuroIS 2013. 2013: 3-3. eng - Researchers - application/pdf + info:eu-repo/semantics/openAccess -
+ + + + http://pub.uni-bielefeld.de/oai + oai:pub.uni-bielefeld.de:2578942 + 2018-07-24T13:01:16Z + http://www.openarchives.org/OAI/2.0/oai_dc/ + + + + false + false + 0.9 + + + + +
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/synonyms.txt b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/synonyms.txt new file mode 100644 index 000000000..729296522 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/synonyms.txt @@ -0,0 +1,1234 @@ +dnet:access_modes @=@ CLOSED @=@ http://purl.org/coar/access_right/c_14cb +dnet:access_modes @=@ CLOSED @=@ info:eu-repo/semantics/closedAccess +dnet:access_modes @=@ EMBARGO @=@ http://purl.org/coar/access_right/c_f1cf +dnet:access_modes @=@ EMBARGO @=@ info:eu-repo/semantics/embargoedAccess +dnet:access_modes @=@ OPEN @=@ Creative Commons License [CC BY-NC-ND] http://creativecommons.org/licenses/by-nc-nd/3.0/de/ +dnet:access_modes @=@ OPEN @=@ Creative commons +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by-nc-nd/3.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by-nc/3.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by-sa/3.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by-sa/4.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by/3.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by/3.0/us/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by/4.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/publicdomain/zero/1.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/publicdomain/zero/1.0/ & http://www.canadensys.net/norms +dnet:access_modes @=@ OPEN @=@ http://purl.org/coar/access_right/c_abf2 +dnet:access_modes @=@ OPEN @=@ https://creativecommons.org/licenses/by-nc/4.0/ +dnet:access_modes @=@ OPEN @=@ info:eu-repo/semantics/openAccess +dnet:access_modes @=@ OPEN @=@ open_access +dnet:access_modes @=@ RESTRICTED @=@ http://purl.org/coar/access_right/c_16ec +dnet:access_modes @=@ RESTRICTED @=@ info:eu-repo/semantics/restrictedAccess +dnet:compatibilityLevel @=@ openaire-pub_4.0 @=@ openaire4.0 +dnet:subject_classification_typologies @=@ jel @=@ jelElement +dnet:publication_resource @=@ 0018 @=@ Comment/debate +dnet:publication_resource @=@ 0018 @=@ http://purl.org/coar/resource_type/c_1162 +dnet:publication_resource @=@ 0018 @=@ info:eu-repo/semantics/annotation +dnet:publication_resource @=@ 0001 @=@ A1 Alkuperäisartikkeli tieteellisessä aikakauslehdessä +dnet:publication_resource @=@ 0001 @=@ Article +dnet:publication_resource @=@ 0001 @=@ Article (author) +dnet:publication_resource @=@ 0001 @=@ Article - letter to the editor +dnet:publication_resource @=@ 0001 @=@ Article / Letter to editor +dnet:publication_resource @=@ 0001 @=@ Article / Letter to the editor +dnet:publication_resource @=@ 0001 @=@ Article / Newspaper +dnet:publication_resource @=@ 0001 @=@ Article in journal +dnet:publication_resource @=@ 0001 @=@ Article in monograph or in proceedings +dnet:publication_resource @=@ 0001 @=@ Article in proceedings +dnet:publication_resource @=@ 0001 @=@ Article-letter to the editor +dnet:publication_resource @=@ 0001 @=@ Article/Letter to editor +dnet:publication_resource @=@ 0001 @=@ Articolo +dnet:publication_resource @=@ 0001 @=@ Artículo +dnet:publication_resource @=@ 0001 @=@ Aufsatz +dnet:publication_resource @=@ 0001 @=@ Clinical Study +dnet:publication_resource @=@ 0001 @=@ Institutional Series +dnet:publication_resource @=@ 0001 @=@ International Journal +dnet:publication_resource @=@ 0001 @=@ International Journal Abstract 
+dnet:publication_resource @=@ 0001 @=@ International Journal ISI/JCR +dnet:publication_resource @=@ 0001 @=@ Journal (full / special issue) +dnet:publication_resource @=@ 0001 @=@ Journal Article/Review +dnet:publication_resource @=@ 0001 @=@ Journal article +dnet:publication_resource @=@ 0001 @=@ Journal article (on-line or printed) +dnet:publication_resource @=@ 0001 @=@ Journal articles +dnet:publication_resource @=@ 0001 @=@ Journal paper +dnet:publication_resource @=@ 0001 @=@ National Journal +dnet:publication_resource @=@ 0001 @=@ Original article (non peer-reviewed) +dnet:publication_resource @=@ 0001 @=@ Original article (peer-reviewed) +dnet:publication_resource @=@ 0001 @=@ Peer-reviewed Article +dnet:publication_resource @=@ 0001 @=@ Published Journal Article +dnet:publication_resource @=@ 0001 @=@ Research Article +dnet:publication_resource @=@ 0001 @=@ Review article (non peer-reviewed) +dnet:publication_resource @=@ 0001 @=@ Review article (peer-reviewed) +dnet:publication_resource @=@ 0001 @=@ Volumes Edited / Special Issues +dnet:publication_resource @=@ 0001 @=@ article in non peer-reviewed journal +dnet:publication_resource @=@ 0001 @=@ article in peer-reviewed journal +dnet:publication_resource @=@ 0001 @=@ article-commentary +dnet:publication_resource @=@ 0001 @=@ article_site_web +dnet:publication_resource @=@ 0001 @=@ doc-type:Journal Article +dnet:publication_resource @=@ 0001 @=@ doc-type:article +dnet:publication_resource @=@ 0001 @=@ http://purl.org/coar/resource_type/c_2df8fbb1 +dnet:publication_resource @=@ 0001 @=@ http://purl.org/coar/resource_type/c_545b +dnet:publication_resource @=@ 0001 @=@ http://purl.org/coar/resource_type/c_6501 +dnet:publication_resource @=@ 0001 @=@ http://purl.org/coar/resource_type/c_7877 +dnet:publication_resource @=@ 0001 @=@ in-brief +dnet:publication_resource @=@ 0001 @=@ info:eu-repo/semantics/article +dnet:publication_resource @=@ 0001 @=@ journal-article +dnet:publication_resource @=@ 0001 @=@ journalArticle +dnet:publication_resource @=@ 0001 @=@ journal_article +dnet:publication_resource @=@ 0001 @=@ letter +dnet:publication_resource @=@ 0001 @=@ non peer-reviewed article +dnet:publication_resource @=@ 0001 @=@ partial-retraction +dnet:publication_resource @=@ 0001 @=@ proceeding with peer review +dnet:publication_resource @=@ 0001 @=@ publication-article +dnet:publication_resource @=@ 0001 @=@ rapid-communication +dnet:publication_resource @=@ 0001 @=@ reply +dnet:publication_resource @=@ 0001 @=@ research-article +dnet:publication_resource @=@ 0001 @=@ retraction +dnet:publication_resource @=@ 0001 @=@ review-article +dnet:publication_resource @=@ 0001 @=@ text (article) +dnet:publication_resource @=@ 0001 @=@ Статья +dnet:publication_resource @=@ 0001 @=@ ArticleArtikel +dnet:publication_resource @=@ 0033 @=@ AUDIOVISUAL_DOCUMENT +dnet:publication_resource @=@ 0033 @=@ Audiovisual/Audiovisual +dnet:publication_resource @=@ 0033 @=@ http://purl.org/coar/resource_type/c_c513 +dnet:publication_resource @=@ 0008 @=@ Bachelor's +dnet:publication_resource @=@ 0008 @=@ Bachelor's Degree +dnet:publication_resource @=@ 0008 @=@ Bachelors Thesis +dnet:publication_resource @=@ 0008 @=@ Proyecto fin de carrera +dnet:publication_resource @=@ 0008 @=@ Undergraduate Thesis +dnet:publication_resource @=@ 0008 @=@ http://purl.org/coar/resource_type/c_7a1f +dnet:publication_resource @=@ 0008 @=@ info:eu-repo/semantics/bachelorThesis +dnet:publication_resource @=@ 0008 @=@ выпускная бакалаврская работа +dnet:publication_resource @=@ 0002 
@=@ Book (monograph) +dnet:publication_resource @=@ 0002 @=@ Book (non peer-reviewed) +dnet:publication_resource @=@ 0002 @=@ Book (peer-reviewed) +dnet:publication_resource @=@ 0002 @=@ Book - monograph - editorial book +dnet:publication_resource @=@ 0002 @=@ Book Section +dnet:publication_resource @=@ 0002 @=@ Book as author +dnet:publication_resource @=@ 0002 @=@ Buch +dnet:publication_resource @=@ 0002 @=@ International Book/Monograph +dnet:publication_resource @=@ 0002 @=@ Libro +dnet:publication_resource @=@ 0002 @=@ Monografia +dnet:publication_resource @=@ 0002 @=@ Monograph +dnet:publication_resource @=@ 0002 @=@ National Book/Monograph +dnet:publication_resource @=@ 0002 @=@ atlas +dnet:publication_resource @=@ 0002 @=@ book +dnet:publication_resource @=@ 0002 @=@ book-series +dnet:publication_resource @=@ 0002 @=@ book-set +dnet:publication_resource @=@ 0002 @=@ book-track +dnet:publication_resource @=@ 0002 @=@ book_series +dnet:publication_resource @=@ 0002 @=@ book_title +dnet:publication_resource @=@ 0002 @=@ doc-type:book +dnet:publication_resource @=@ 0002 @=@ edited-book +dnet:publication_resource @=@ 0002 @=@ http://purl.org/coar/resource_type/c_2f33 +dnet:publication_resource @=@ 0002 @=@ info:eu-repo/semantics/book +dnet:publication_resource @=@ 0002 @=@ ouvrage +dnet:publication_resource @=@ 0002 @=@ publication-book +dnet:publication_resource @=@ 0002 @=@ reference-book +dnet:publication_resource @=@ 0002 @=@ scientific book +dnet:publication_resource @=@ 0002 @=@ Монография +dnet:publication_resource @=@ 0002 @=@ Учебник +dnet:publication_resource @=@ 0037 @=@ clinicalTrial +dnet:publication_resource @=@ 0037 @=@ http://purl.org/coar/resource_type/c_cb28 +dnet:publication_resource @=@ 0022 @=@ collection +dnet:publication_resource @=@ 0004 @=@ A4 Artikkeli konferenssijulkaisussa +dnet:publication_resource @=@ 0004 @=@ Comunicación de congreso +dnet:publication_resource @=@ 0004 @=@ Conference Paper +dnet:publication_resource @=@ 0004 @=@ Conference Paper/Proceeding/Abstract +dnet:publication_resource @=@ 0004 @=@ Conference Proceedings +dnet:publication_resource @=@ 0004 @=@ Conference article +dnet:publication_resource @=@ 0004 @=@ Conference contribution +dnet:publication_resource @=@ 0004 @=@ Conference lecture +dnet:publication_resource @=@ 0004 @=@ Conference or Workshop Item +dnet:publication_resource @=@ 0004 @=@ Conference paper, poster, etc. 
+dnet:publication_resource @=@ 0004 @=@ Conference papers +dnet:publication_resource @=@ 0004 @=@ Conference report +dnet:publication_resource @=@ 0004 @=@ International Conference +dnet:publication_resource @=@ 0004 @=@ International Conference Abstract/Poster +dnet:publication_resource @=@ 0004 @=@ International Conference ISI/JCR +dnet:publication_resource @=@ 0004 @=@ International Conference communication/abstract/poster +dnet:publication_resource @=@ 0004 @=@ National Conference +dnet:publication_resource @=@ 0004 @=@ National Conference Abstract/Poster +dnet:publication_resource @=@ 0004 @=@ National Conference communication/abstract/poster +dnet:publication_resource @=@ 0004 @=@ PREFACE_PROCEEDINGS +dnet:publication_resource @=@ 0004 @=@ PROCEEDING_PAPER +dnet:publication_resource @=@ 0004 @=@ Papers in Conference Proceedings +dnet:publication_resource @=@ 0004 @=@ Presentación +dnet:publication_resource @=@ 0004 @=@ Proceedings (peer-reviewed) +dnet:publication_resource @=@ 0004 @=@ Proceedings of a Conference +dnet:publication_resource @=@ 0004 @=@ Proceedings paper +dnet:publication_resource @=@ 0004 @=@ Póster +dnet:publication_resource @=@ 0004 @=@ actes_congres +dnet:publication_resource @=@ 0004 @=@ communication_avec_actes +dnet:publication_resource @=@ 0004 @=@ communication_invitee +dnet:publication_resource @=@ 0004 @=@ communication_par_affiche +dnet:publication_resource @=@ 0004 @=@ communication_sans_actes +dnet:publication_resource @=@ 0004 @=@ conference +dnet:publication_resource @=@ 0004 @=@ conference item +dnet:publication_resource @=@ 0004 @=@ conference proceeding +dnet:publication_resource @=@ 0004 @=@ conferenceObject +dnet:publication_resource @=@ 0004 @=@ conference_paper +dnet:publication_resource @=@ 0004 @=@ doc-type:conferenceObject +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_18co +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_18cp +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_5794 +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_6670 +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_c94f +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_f744 +dnet:publication_resource @=@ 0004 @=@ info:eu-repo/semantics/conferenceItem +dnet:publication_resource @=@ 0004 @=@ info:eu-repo/semantics/conferenceObject +dnet:publication_resource @=@ 0004 @=@ invited conference talk +dnet:publication_resource @=@ 0004 @=@ poster +dnet:publication_resource @=@ 0004 @=@ presentation +dnet:publication_resource @=@ 0004 @=@ proceeding, seminar, workshop without peer review +dnet:publication_resource @=@ 0004 @=@ proceedings +dnet:publication_resource @=@ 0004 @=@ proceedings-article +dnet:publication_resource @=@ 0004 @=@ publication-conferencepaper +dnet:publication_resource @=@ 0004 @=@ научный доклад +dnet:publication_resource @=@ 0005 @=@ Newspaper or magazine article +dnet:publication_resource @=@ 0005 @=@ http://purl.org/coar/resource_type/c_998f +dnet:publication_resource @=@ 0005 @=@ info:eu-repo/semantics/contributionToPeriodical +dnet:publication_resource @=@ 0045 @=@ Data Management Plan +dnet:publication_resource @=@ 0045 @=@ Data Management Plan (NSF Generic) +dnet:publication_resource @=@ 0045 @=@ http://purl.org/coar/resource_type/c_ab20 +dnet:publication_resource @=@ 0045 @=@ http://purl.org/spar/fabio/DataManagementPolicy +dnet:publication_resource @=@ 0045 @=@ 
http://purl.org/spar/fabio/DataManagementPolicyDocument +dnet:publication_resource @=@ 0045 @=@ http://purl.org/spar/fabio/DataMangementPlan +dnet:publication_resource @=@ 0045 @=@ plan de gestión de datos +dnet:publication_resource @=@ 0045 @=@ publication-datamanagementplan +dnet:publication_resource @=@ 0031 @=@ Data Descriptor +dnet:publication_resource @=@ 0031 @=@ DataPaper +dnet:publication_resource @=@ 0031 @=@ data-article +dnet:publication_resource @=@ 0031 @=@ http://purl.org/coar/resource_type/c_beb9 +dnet:publication_resource @=@ 0021 @=@ Dataset/Dataset +dnet:publication_resource @=@ 0021 @=@ Research Data +dnet:publication_resource @=@ 0021 @=@ dataset +dnet:publication_resource @=@ 0021 @=@ http://purl.org/coar/resource_type/c_ddb1 +dnet:publication_resource @=@ 0021 @=@ info:eu-repo/semantics/DDIInstance +dnet:publication_resource @=@ 0021 @=@ info:eu-repo/semantics/datafile +dnet:publication_resource @=@ 0021 @=@ info:eu-repo/semantics/dataset +dnet:publication_resource @=@ 0021 @=@ info:eu-repo/semantics/enhancedObjectFile +dnet:publication_resource @=@ 0006 @=@ Diss +dnet:publication_resource @=@ 0006 @=@ Dissertation +dnet:publication_resource @=@ 0006 @=@ Doctoral +dnet:publication_resource @=@ 0006 @=@ DoctoralThesis +dnet:publication_resource @=@ 0006 @=@ PhD thesis +dnet:publication_resource @=@ 0006 @=@ Tesis +dnet:publication_resource @=@ 0006 @=@ Text.Thesis.Doctoral +dnet:publication_resource @=@ 0006 @=@ Theses +dnet:publication_resource @=@ 0006 @=@ Thesis +dnet:publication_resource @=@ 0006 @=@ Thesis or Dissertation +dnet:publication_resource @=@ 0006 @=@ Thesis.Doctoral +dnet:publication_resource @=@ 0006 @=@ doc-type:doctoralThesis +dnet:publication_resource @=@ 0006 @=@ http://purl.org/coar/resource_type/c_db06 +dnet:publication_resource @=@ 0006 @=@ info:eu-repo/semantics/doctoralThesis +dnet:publication_resource @=@ 0006 @=@ publication-thesis +dnet:publication_resource @=@ 0006 @=@ these +dnet:publication_resource @=@ 0006 @=@ these exercice +dnet:publication_resource @=@ 0023 @=@ Event/Event +dnet:publication_resource @=@ 0023 @=@ event +dnet:publication_resource @=@ 0009 @=@ Departmental Technical Report +dnet:publication_resource @=@ 0009 @=@ Informe Técnico +dnet:publication_resource @=@ 0009 @=@ RESEARCH_REPORT +dnet:publication_resource @=@ 0009 @=@ Tech-Report +dnet:publication_resource @=@ 0009 @=@ Technical Report +dnet:publication_resource @=@ 0009 @=@ http://purl.org/coar/resource_type/c_18gh +dnet:publication_resource @=@ 0009 @=@ publication-technicalnote +dnet:publication_resource @=@ 0009 @=@ research report +dnet:publication_resource @=@ 0024 @=@ Video +dnet:publication_resource @=@ 0024 @=@ film +dnet:publication_resource @=@ 0024 @=@ http://purl.org/coar/resource_type/c_12ce +dnet:publication_resource @=@ 0024 @=@ http://purl.org/coar/resource_type/c_8a7e +dnet:publication_resource @=@ 0025 @=@ Diagram +dnet:publication_resource @=@ 0025 @=@ Drawing +dnet:publication_resource @=@ 0025 @=@ Figure +dnet:publication_resource @=@ 0025 @=@ Image/Image +dnet:publication_resource @=@ 0025 @=@ Imagen +dnet:publication_resource @=@ 0025 @=@ Photo +dnet:publication_resource @=@ 0025 @=@ Plot +dnet:publication_resource @=@ 0025 @=@ fotó +dnet:publication_resource @=@ 0025 @=@ grafika +dnet:publication_resource @=@ 0025 @=@ http://purl.org/coar/resource_type/c_ecc8 +dnet:publication_resource @=@ 0025 @=@ image +dnet:publication_resource @=@ 0025 @=@ image-diagram +dnet:publication_resource @=@ 0025 @=@ image-drawing +dnet:publication_resource 
@=@ 0025 @=@ image-figure +dnet:publication_resource @=@ 0025 @=@ image-other +dnet:publication_resource @=@ 0025 @=@ image-photo +dnet:publication_resource @=@ 0025 @=@ image-plot +dnet:publication_resource @=@ 0026 @=@ http://purl.org/coar/resource_type/c_e9a0 +dnet:publication_resource @=@ 0026 @=@ interactiveResource +dnet:publication_resource @=@ 0011 @=@ Internal note +dnet:publication_resource @=@ 0011 @=@ http://purl.org/coar/resource_type/c_18ww +dnet:publication_resource @=@ 0043 @=@ http://purl.org/coar/resource_type/c_0640 +dnet:publication_resource @=@ 0010 @=@ Inaugural lecture +dnet:publication_resource @=@ 0010 @=@ Material didáctico +dnet:publication_resource @=@ 0010 @=@ Public-Lecture +dnet:publication_resource @=@ 0010 @=@ http://purl.org/coar/resource_type/c_8544 +dnet:publication_resource @=@ 0010 @=@ info:eu-repo/semantics/lecture +dnet:publication_resource @=@ 0010 @=@ lesson +dnet:publication_resource @=@ 0010 @=@ Учебный материал +dnet:publication_resource @=@ 0007 @=@ Diploma Project +dnet:publication_resource @=@ 0007 @=@ MSc Thesis +dnet:publication_resource @=@ 0007 @=@ Master Degree +dnet:publication_resource @=@ 0007 @=@ Master's +dnet:publication_resource @=@ 0007 @=@ Masterarbeit u.a. +dnet:publication_resource @=@ 0007 @=@ Masters (Taught) +dnet:publication_resource @=@ 0007 @=@ Masters thesis +dnet:publication_resource @=@ 0007 @=@ Masters-Thesis.Magister +dnet:publication_resource @=@ 0007 @=@ Tesina +dnet:publication_resource @=@ 0007 @=@ Thesis.Master +dnet:publication_resource @=@ 0007 @=@ Trabajo fin de Máster +dnet:publication_resource @=@ 0007 @=@ doc-type:masterThesis +dnet:publication_resource @=@ 0007 @=@ hdr +dnet:publication_resource @=@ 0007 @=@ http://purl.org/coar/resource_type/c_bdcc +dnet:publication_resource @=@ 0007 @=@ info:eu-repo/semantics/masterThesis +dnet:publication_resource @=@ 0007 @=@ masterThesis +dnet:publication_resource @=@ 0007 @=@ memoire +dnet:publication_resource @=@ 0027 @=@ Model/Model +dnet:publication_resource @=@ 0027 @=@ model +dnet:publication_resource @=@ 0020 @=@ Exhibition +dnet:publication_resource @=@ 0020 @=@ Learning Object +dnet:publication_resource @=@ 0020 @=@ Mapa +dnet:publication_resource @=@ 0020 @=@ Modelo de utilidad +dnet:publication_resource @=@ 0020 @=@ PEDAGOGICAL_DOCUMENT +dnet:publication_resource @=@ 0020 @=@ Partitura +dnet:publication_resource @=@ 0020 @=@ Sitio web +dnet:publication_resource @=@ 0020 @=@ Trabajo de divulgación +dnet:publication_resource @=@ 0020 @=@ Web publication/site +dnet:publication_resource @=@ 0020 @=@ application +dnet:publication_resource @=@ 0020 @=@ artefact +dnet:publication_resource @=@ 0020 @=@ carte +dnet:publication_resource @=@ 0020 @=@ composition +dnet:publication_resource @=@ 0020 @=@ document_audiovisuel +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_12cc +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_12cd +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_1843 +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_18cd +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_18cw +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_26e4 +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_7ad9 +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_e059 +dnet:publication_resource @=@ 0020 @=@ info:eu-repo/semantics/other +dnet:publication_resource @=@ 
0020 @=@ learningObject +dnet:publication_resource @=@ 0020 @=@ map +dnet:publication_resource @=@ 0020 @=@ misc +dnet:publication_resource @=@ 0020 @=@ other +dnet:publication_resource @=@ 0020 @=@ revue +dnet:publication_resource @=@ 0038 @=@ Abstract +dnet:publication_resource @=@ 0038 @=@ Blog +dnet:publication_resource @=@ 0038 @=@ Book Prospectus +dnet:publication_resource @=@ 0038 @=@ Dictionary Entry +dnet:publication_resource @=@ 0038 @=@ Disclosure +dnet:publication_resource @=@ 0038 @=@ Editorial +dnet:publication_resource @=@ 0038 @=@ Editorial ISI/JCR +dnet:publication_resource @=@ 0038 @=@ Editors +dnet:publication_resource @=@ 0038 @=@ Editors (non peer-reviewed) +dnet:publication_resource @=@ 0038 @=@ Editors (peer-reviewed) +dnet:publication_resource @=@ 0038 @=@ Encyclopedia Entry +dnet:publication_resource @=@ 0038 @=@ Entrada de blog +dnet:publication_resource @=@ 0038 @=@ Funding Submission +dnet:publication_resource @=@ 0038 @=@ HabilitationThesis +dnet:publication_resource @=@ 0038 @=@ License +dnet:publication_resource @=@ 0038 @=@ Manual +dnet:publication_resource @=@ 0038 @=@ Manuscript +dnet:publication_resource @=@ 0038 @=@ Manuscrito +dnet:publication_resource @=@ 0038 @=@ Other publication (non peer-review) +dnet:publication_resource @=@ 0038 @=@ Other publication (peer-review) +dnet:publication_resource @=@ 0038 @=@ Revista +dnet:publication_resource @=@ 0038 @=@ Supervised Student Publication +dnet:publication_resource @=@ 0038 @=@ Tesis/trabajos de grado – Thesis +dnet:publication_resource @=@ 0038 @=@ Text +dnet:publication_resource @=@ 0038 @=@ Text/Text +dnet:publication_resource @=@ 0038 @=@ Trademark +dnet:publication_resource @=@ 0038 @=@ Translation +dnet:publication_resource @=@ 0038 @=@ afterword +dnet:publication_resource @=@ 0038 @=@ avantpropos +dnet:publication_resource @=@ 0038 @=@ bibliography +dnet:publication_resource @=@ 0038 @=@ chronique +dnet:publication_resource @=@ 0038 @=@ compte rendu +dnet:publication_resource @=@ 0038 @=@ correction +dnet:publication_resource @=@ 0038 @=@ foreword +dnet:publication_resource @=@ 0038 @=@ habilitation à diriger des recherches +dnet:publication_resource @=@ 0038 @=@ historicalDocument +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_0040 +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_0857 +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_18cf +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_18wz +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_3e5a +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_46ec +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_6947 +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_7acd +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_86bc +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_b239 +dnet:publication_resource @=@ 0038 @=@ note de lecture +dnet:publication_resource @=@ 0038 @=@ notedelecture +dnet:publication_resource @=@ 0038 @=@ other publication +dnet:publication_resource @=@ 0038 @=@ postface +dnet:publication_resource @=@ 0038 @=@ publication-other +dnet:publication_resource @=@ 0038 @=@ revuedepresse +dnet:publication_resource @=@ 0038 @=@ sa_component +dnet:publication_resource @=@ 0038 @=@ standard +dnet:publication_resource @=@ 0038 @=@ standard-series +dnet:publication_resource @=@ 
0013 @=@ A3 Kirjan tai muun kokoomateoksen osa +dnet:publication_resource @=@ 0013 @=@ Book Part (author) +dnet:publication_resource @=@ 0013 @=@ Book Section / Chapter +dnet:publication_resource @=@ 0013 @=@ Book chapter or Essay in book +dnet:publication_resource @=@ 0013 @=@ Book editorial +dnet:publication_resource @=@ 0013 @=@ Book section +dnet:publication_resource @=@ 0013 @=@ Book_Chapter +dnet:publication_resource @=@ 0013 @=@ Buchbeitrag +dnet:publication_resource @=@ 0013 @=@ Capítulo de libro +dnet:publication_resource @=@ 0013 @=@ Contribution to International Book/Monograph +dnet:publication_resource @=@ 0013 @=@ Contribution to International Book/Monograph ISI/JCR +dnet:publication_resource @=@ 0013 @=@ Contribution to National Book/Monograph +dnet:publication_resource @=@ 0013 @=@ Contribution to book (non peer-reviewed) +dnet:publication_resource @=@ 0013 @=@ Contribution to book (peer-reviewed) +dnet:publication_resource @=@ 0013 @=@ Part of book - chapter +dnet:publication_resource @=@ 0013 @=@ book chapter +dnet:publication_resource @=@ 0013 @=@ book-part +dnet:publication_resource @=@ 0013 @=@ bookPart +dnet:publication_resource @=@ 0013 @=@ book_content +dnet:publication_resource @=@ 0013 @=@ chapitre_ouvrage +dnet:publication_resource @=@ 0013 @=@ chapter +dnet:publication_resource @=@ 0013 @=@ doc-type:bookPart +dnet:publication_resource @=@ 0013 @=@ http://purl.org/coar/resource_type/c_3248 +dnet:publication_resource @=@ 0013 @=@ info:eu-repo/semantics/bookPart +dnet:publication_resource @=@ 0013 @=@ publication-section +dnet:publication_resource @=@ 0013 @=@ reference-entry +dnet:publication_resource @=@ 0013 @=@ reference_entry +dnet:publication_resource @=@ 0013 @=@ scientific book chapter +dnet:publication_resource @=@ 0013 @=@ Глава монографии +dnet:publication_resource @=@ 0019 @=@ H1 Myönnetty patentti +dnet:publication_resource @=@ 0019 @=@ Patent +dnet:publication_resource @=@ 0019 @=@ Patente +dnet:publication_resource @=@ 0019 @=@ Solicitud de patente +dnet:publication_resource @=@ 0019 @=@ Traducción de patente +dnet:publication_resource @=@ 0019 @=@ brevet +dnet:publication_resource @=@ 0019 @=@ http://purl.org/coar/resource_type/c_15cd +dnet:publication_resource @=@ 0019 @=@ info:eu-repo/semantics/patent +dnet:publication_resource @=@ 0019 @=@ publication-patent +dnet:publication_resource @=@ 0028 @=@ Service +dnet:publication_resource @=@ 0028 @=@ physicalObject +dnet:publication_resource @=@ 0016 @=@ Pre Print +dnet:publication_resource @=@ 0016 @=@ Pre-print +dnet:publication_resource @=@ 0016 @=@ http://purl.org/coar/resource_type/c_816b +dnet:publication_resource @=@ 0016 @=@ info:eu-repo/semantics/preprint +dnet:publication_resource @=@ 0016 @=@ publication-preprint +dnet:publication_resource @=@ 0016 @=@ Препринт +dnet:publication_resource @=@ 0034 @=@ Project deliverable +dnet:publication_resource @=@ 0034 @=@ http://purl.org/coar/resource_type/c_18op +dnet:publication_resource @=@ 0034 @=@ publication-deliverable +dnet:publication_resource @=@ 0035 @=@ Project milestone +dnet:publication_resource @=@ 0035 @=@ publication-milestone +dnet:publication_resource @=@ 0036 @=@ Proposal +dnet:publication_resource @=@ 0036 @=@ http://purl.org/coar/resource_type/c_baaf +dnet:publication_resource @=@ 0036 @=@ research-proposal +dnet:publication_resource @=@ 0017 @=@ ACTIVITY_REPORT +dnet:publication_resource @=@ 0017 @=@ Commissioned report +dnet:publication_resource @=@ 0017 @=@ D4 Julkaistu kehittämis- tai tutkimusraportti tai -selvitys 
+dnet:publication_resource @=@ 0017 @=@ Deliverable +dnet:publication_resource @=@ 0017 @=@ Documento tecnico +dnet:publication_resource @=@ 0017 @=@ Project Report +dnet:publication_resource @=@ 0017 @=@ Software documentation +dnet:publication_resource @=@ 0017 @=@ brief-report +dnet:publication_resource @=@ 0017 @=@ case-report +dnet:publication_resource @=@ 0017 @=@ chapitre_rapport +dnet:publication_resource @=@ 0017 @=@ doc-type:report +dnet:publication_resource @=@ 0017 @=@ document_institutionnel +dnet:publication_resource @=@ 0017 @=@ document_technique +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_186u +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_18hj +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_18wq +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_18ws +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_71bd +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_93fc +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_ba1f +dnet:publication_resource @=@ 0017 @=@ info:eu-repo/semantics/report +dnet:publication_resource @=@ 0017 @=@ publication-report +dnet:publication_resource @=@ 0017 @=@ publication-softwaredocumentation +dnet:publication_resource @=@ 0017 @=@ rapport_expertise +dnet:publication_resource @=@ 0017 @=@ rapport_mission +dnet:publication_resource @=@ 0017 @=@ report +dnet:publication_resource @=@ 0017 @=@ report-paper +dnet:publication_resource @=@ 0017 @=@ report-paper_title +dnet:publication_resource @=@ 0017 @=@ report-series +dnet:publication_resource @=@ 0017 @=@ support_cours +dnet:publication_resource @=@ 0014 @=@ Arbeitspapier +dnet:publication_resource @=@ 0014 @=@ Departmental Bulletin Paper +dnet:publication_resource @=@ 0014 @=@ Documento de trabajo +dnet:publication_resource @=@ 0014 @=@ Paper +dnet:publication_resource @=@ 0014 @=@ Project description +dnet:publication_resource @=@ 0014 @=@ Research-Paper +dnet:publication_resource @=@ 0014 @=@ ResearchPaper +dnet:publication_resource @=@ 0014 @=@ Working / discussion paper +dnet:publication_resource @=@ 0014 @=@ Working Paper +dnet:publication_resource @=@ 0014 @=@ Working Paper / Technical Report +dnet:publication_resource @=@ 0014 @=@ doc-type:workingPaper +dnet:publication_resource @=@ 0014 @=@ http://purl.org/coar/resource_type/c_8042 +dnet:publication_resource @=@ 0014 @=@ info:eu-repo/semantics/paper +dnet:publication_resource @=@ 0014 @=@ info:eu-repo/semantics/workingPaper +dnet:publication_resource @=@ 0014 @=@ publication-workingpaper +dnet:publication_resource @=@ 0014 @=@ workingPaper +dnet:publication_resource @=@ 0015 @=@ A2 Katsausartikkeli tieteellisessä aikakauslehdessä +dnet:publication_resource @=@ 0015 @=@ Book Review +dnet:publication_resource @=@ 0015 @=@ Book/Film/Article review +dnet:publication_resource @=@ 0015 @=@ Literature review +dnet:publication_resource @=@ 0015 @=@ Peer review +dnet:publication_resource @=@ 0015 @=@ Reseña bibliográfica +dnet:publication_resource @=@ 0015 @=@ Review Article +dnet:publication_resource @=@ 0015 @=@ RezensionReview +dnet:publication_resource @=@ 0015 @=@ book-review +dnet:publication_resource @=@ 0015 @=@ http://purl.org/coar/resource_type/c_ba08 +dnet:publication_resource @=@ 0015 @=@ http://purl.org/coar/resource_type/c_dcae04bc +dnet:publication_resource @=@ 0015 @=@ http://purl.org/coar/resource_type/c_efa0 +dnet:publication_resource @=@ 0015 
@=@ info:eu-repo/semantics/review +dnet:publication_resource @=@ 0015 @=@ peer-review +dnet:publication_resource @=@ 0029 @=@ Software +dnet:publication_resource @=@ 0029 @=@ Software/Software +dnet:publication_resource @=@ 0029 @=@ Workflow +dnet:publication_resource @=@ 0029 @=@ Workflow/Workflow +dnet:publication_resource @=@ 0029 @=@ http://purl.org/coar/resource_type/c_393c +dnet:publication_resource @=@ 0029 @=@ http://purl.org/coar/resource_type/c_5ce6 +dnet:publication_resource @=@ 0029 @=@ http://purl.org/coar/resource_type/c_c950 +dnet:publication_resource @=@ 0032 @=@ http://purl.org/coar/resource_type/c_7bab +dnet:publication_resource @=@ 0030 @=@ http://purl.org/coar/resource_type/c_18cc +dnet:publication_resource @=@ 0030 @=@ sound +dnet:publication_resource @=@ 0044 @=@ Graduate diploma +dnet:publication_resource @=@ 0044 @=@ Undergraduate diploma +dnet:publication_resource @=@ 0000 @=@ UNKNOWN +dnet:publication_resource @=@ 0042 @=@ EGI Virtual Appliance +dnet:languages @=@ abk @=@ ab +dnet:languages @=@ aar @=@ aa +dnet:languages @=@ afr @=@ af +dnet:languages @=@ alb/sqi @=@ sq +dnet:languages @=@ amh @=@ am +dnet:languages @=@ ara @=@ ar +dnet:languages @=@ arm/hye @=@ hy +dnet:languages @=@ asm @=@ as +dnet:languages @=@ ina @=@ ia +dnet:languages @=@ aym @=@ ay +dnet:languages @=@ aze @=@ az +dnet:languages @=@ bak @=@ ba +dnet:languages @=@ baq/eus @=@ eu +dnet:languages @=@ bel @=@ be +dnet:languages @=@ ben @=@ bn +dnet:languages @=@ bih @=@ bh +dnet:languages @=@ bis @=@ bi +dnet:languages @=@ bre @=@ br +dnet:languages @=@ bul @=@ bg +dnet:languages @=@ bur/mya @=@ my +dnet:languages @=@ cat @=@ ca +dnet:languages @=@ chi/zho @=@ zh +dnet:languages @=@ cos @=@ co +dnet:languages @=@ hrv @=@ hr +dnet:languages @=@ hrv @=@ hr +dnet:languages @=@ hrv @=@ scr/hrv +dnet:languages @=@ ces/cze @=@ cs +dnet:languages @=@ dan @=@ da +dnet:languages @=@ dut/nld @=@ dut/nla +dnet:languages @=@ dut/nld @=@ dutdut +dnet:languages @=@ dut/nld @=@ nl +dnet:languages @=@ dut/nld @=@ nl_be +dnet:languages @=@ dut/nld @=@ nl_nl +dnet:languages @=@ dut/nld @=@ nld +dnet:languages @=@ dzo @=@ dz +dnet:languages @=@ eng @=@ en +dnet:languages @=@ eng @=@ en_au +dnet:languages @=@ eng @=@ en_en +dnet:languages @=@ eng @=@ en_gb +dnet:languages @=@ eng @=@ en_nz +dnet:languages @=@ eng @=@ en_us +dnet:languages @=@ eng @=@ english +dnet:languages @=@ eng @=@ en-us +dnet:languages @=@ eng @=@ en-US +dnet:languages @=@ eng @=@ English +dnet:languages @=@ eng @=@ EN +dnet:languages @=@ eng @=@ en angielski +dnet:languages @=@ eng @=@ en-GB +dnet:languages @=@ eng @=@ Englisch +dnet:languages @=@ epo @=@ eo +dnet:languages @=@ est @=@ et +dnet:languages @=@ fao @=@ fo +dnet:languages @=@ fij @=@ fj +dnet:languages @=@ fin @=@ fi +dnet:languages @=@ fin @=@ Finnish +dnet:languages @=@ fra/fre @=@ fr +dnet:languages @=@ fra/fre @=@ FR +dnet:languages @=@ fra/fre @=@ fr_be +dnet:languages @=@ fra/fre @=@ fr_fr +dnet:languages @=@ fra/fre @=@ fre/fra +dnet:languages @=@ fra/fre @=@ fra +dnet:languages @=@ fry @=@ fy +dnet:languages @=@ glg @=@ gl +dnet:languages @=@ geo/kat @=@ ka +dnet:languages @=@ deu/ger @=@ de +dnet:languages @=@ deu/ger @=@ ger/deu +dnet:languages @=@ deu/ger @=@ german +dnet:languages @=@ deu/ger @=@ ger +dnet:languages @=@ deu/ger @=@ deu +dnet:languages @=@ deu/ger @=@ DE-de +dnet:languages @=@ ell/gre @=@ el +dnet:languages @=@ ell/gre @=@ gr +dnet:languages @=@ ell/gre @=@ el-GR +dnet:languages @=@ kal @=@ kl +dnet:languages @=@ grn @=@ gn +dnet:languages @=@ guj @=@ 
gu +dnet:languages @=@ hau @=@ ha +dnet:languages @=@ heb @=@ he +dnet:languages @=@ hin @=@ hi +dnet:languages @=@ hun @=@ hu +dnet:languages @=@ ice/isl @=@ is +dnet:languages @=@ ine @=@ - +dnet:languages @=@ ind @=@ id +dnet:languages @=@ iku @=@ iu +dnet:languages @=@ ipk @=@ ik +dnet:languages @=@ gai/iri @=@ ga +dnet:languages @=@ gai/iri @=@ gle +dnet:languages @=@ ita @=@ it +dnet:languages @=@ jpn @=@ ja +dnet:languages @=@ jav @=@ jv +dnet:languages @=@ jav @=@ jv/jw +dnet:languages @=@ jav @=@ jw +dnet:languages @=@ kan @=@ kn +dnet:languages @=@ kas @=@ ks +dnet:languages @=@ kaz @=@ kk +dnet:languages @=@ khm @=@ km +dnet:languages @=@ kin @=@ rw +dnet:languages @=@ kir @=@ ky +dnet:languages @=@ kor @=@ ko +dnet:languages @=@ kur @=@ ku +dnet:languages @=@ lao @=@ lo +dnet:languages @=@ lat @=@ la +dnet:languages @=@ lav @=@ lv +dnet:languages @=@ lin @=@ ln +dnet:languages @=@ lit @=@ lt +dnet:languages @=@ mac/mak @=@ mk +dnet:languages @=@ mlg @=@ mg +dnet:languages @=@ may/msa @=@ ms +dnet:languages @=@ mlt @=@ ml +dnet:languages @=@ mao/mri @=@ mi +dnet:languages @=@ mar @=@ mr +dnet:languages @=@ mol @=@ mo +dnet:languages @=@ mon @=@ mn +dnet:languages @=@ nau @=@ na +dnet:languages @=@ nep @=@ ne +dnet:languages @=@ nor @=@ no +dnet:languages @=@ oci @=@ oc +dnet:languages @=@ ori @=@ or +dnet:languages @=@ orm @=@ om +dnet:languages @=@ pan @=@ pa +dnet:languages @=@ fas/per @=@ fa +dnet:languages @=@ pol @=@ pl +dnet:languages @=@ por @=@ pt +dnet:languages @=@ por @=@ pt_pt +dnet:languages @=@ pus @=@ ps +dnet:languages @=@ que @=@ qu +dnet:languages @=@ roh @=@ rm +dnet:languages @=@ ron/rum @=@ ro +dnet:languages @=@ run @=@ rn +dnet:languages @=@ rus @=@ ru +dnet:languages @=@ smo @=@ sm +dnet:languages @=@ sag @=@ sg +dnet:languages @=@ san @=@ sa +dnet:languages @=@ srp @=@ scc/srp +dnet:languages @=@ srp @=@ sr +dnet:languages @=@ scr @=@ sh +dnet:languages @=@ sna @=@ sn +dnet:languages @=@ snd @=@ sd +dnet:languages @=@ sin @=@ si +dnet:languages @=@ sit @=@ - +dnet:languages @=@ slk/slo @=@ sk +dnet:languages @=@ slv @=@ sl +dnet:languages @=@ som @=@ so +dnet:languages @=@ sot @=@ st +dnet:languages @=@ esl/spa @=@ es +dnet:languages @=@ sun @=@ su +dnet:languages @=@ swa @=@ sw +dnet:languages @=@ ssw @=@ ss +dnet:languages @=@ swe @=@ sv +dnet:languages @=@ swe @=@ sve/swe +dnet:languages @=@ tgl @=@ tl +dnet:languages @=@ tgk @=@ tg +dnet:languages @=@ tam @=@ ta +dnet:languages @=@ tat @=@ tt +dnet:languages @=@ tel @=@ te +dnet:languages @=@ tha @=@ th +dnet:languages @=@ tha @=@ thai +dnet:languages @=@ bod/tib @=@ bo +dnet:languages @=@ tir @=@ ti +dnet:languages @=@ tog @=@ to +dnet:languages @=@ tso @=@ ts +dnet:languages @=@ tsn @=@ tn +dnet:languages @=@ tur @=@ tr +dnet:languages @=@ tuk @=@ tk +dnet:languages @=@ twi @=@ tw +dnet:languages @=@ uig @=@ ug +dnet:languages @=@ ukr @=@ uk +dnet:languages @=@ und @=@ UNKNOWN +dnet:languages @=@ und @=@ none +dnet:languages @=@ urd @=@ ur +dnet:languages @=@ uzb @=@ uz +dnet:languages @=@ vie @=@ vi +dnet:languages @=@ vol @=@ vo +dnet:languages @=@ wln @=@ wa +dnet:languages @=@ cym/wel @=@ cy +dnet:languages @=@ wol @=@ wo +dnet:languages @=@ xho @=@ xh +dnet:languages @=@ yid @=@ yi +dnet:languages @=@ yor @=@ yo +dnet:languages @=@ zha @=@ za +dnet:languages @=@ zul @=@ zu +dnet:result_typologies @=@ dataset @=@ 0021 +dnet:result_typologies @=@ dataset @=@ 0024 +dnet:result_typologies @=@ dataset @=@ 0025 +dnet:result_typologies @=@ dataset @=@ 0030 +dnet:result_typologies @=@ dataset @=@ 0033 
+dnet:result_typologies @=@ dataset @=@ 0037 +dnet:result_typologies @=@ dataset @=@ 0039 +dnet:result_typologies @=@ dataset @=@ 0046 +dnet:result_typologies @=@ other @=@ 0000 +dnet:result_typologies @=@ other @=@ 0010 +dnet:result_typologies @=@ other @=@ 0018 +dnet:result_typologies @=@ other @=@ 0020 +dnet:result_typologies @=@ other @=@ 0022 +dnet:result_typologies @=@ other @=@ 0023 +dnet:result_typologies @=@ other @=@ 0026 +dnet:result_typologies @=@ other @=@ 0027 +dnet:result_typologies @=@ other @=@ 0028 +dnet:result_typologies @=@ other @=@ 0042 +dnet:result_typologies @=@ publication @=@ 0001 +dnet:result_typologies @=@ publication @=@ 0002 +dnet:result_typologies @=@ publication @=@ 0004 +dnet:result_typologies @=@ publication @=@ 0005 +dnet:result_typologies @=@ publication @=@ 0006 +dnet:result_typologies @=@ publication @=@ 0007 +dnet:result_typologies @=@ publication @=@ 0008 +dnet:result_typologies @=@ publication @=@ 0009 +dnet:result_typologies @=@ publication @=@ 0011 +dnet:result_typologies @=@ publication @=@ 0012 +dnet:result_typologies @=@ publication @=@ 0013 +dnet:result_typologies @=@ publication @=@ 0014 +dnet:result_typologies @=@ publication @=@ 0015 +dnet:result_typologies @=@ publication @=@ 0016 +dnet:result_typologies @=@ publication @=@ 0017 +dnet:result_typologies @=@ publication @=@ 0019 +dnet:result_typologies @=@ publication @=@ 0031 +dnet:result_typologies @=@ publication @=@ 0032 +dnet:result_typologies @=@ publication @=@ 0034 +dnet:result_typologies @=@ publication @=@ 0035 +dnet:result_typologies @=@ publication @=@ 0036 +dnet:result_typologies @=@ publication @=@ 0038 +dnet:result_typologies @=@ publication @=@ 0044 +dnet:result_typologies @=@ publication @=@ 0045 +dnet:result_typologies @=@ software @=@ 0029 +dnet:result_typologies @=@ software @=@ 0040 +dnet:countries @=@ AF @=@ AFG +dnet:countries @=@ AF @=@ Afghanistan +dnet:countries @=@ AD @=@ Andorra +dnet:countries @=@ AO @=@ Angola +dnet:countries @=@ AR @=@ ARG +dnet:countries @=@ AR @=@ Argentina +dnet:countries @=@ AU @=@ AUS +dnet:countries @=@ AU @=@ Australia +dnet:countries @=@ AT @=@ AUT +dnet:countries @=@ AT @=@ Austria +dnet:countries @=@ AZ @=@ AZE +dnet:countries @=@ BD @=@ Bangladesh +dnet:countries @=@ BY @=@ Belarus +dnet:countries @=@ BE @=@ BEL +dnet:countries @=@ BE @=@ Belgium +dnet:countries @=@ BJ @=@ BEN +dnet:countries @=@ BO @=@ Bolivia, Plurinational State of +dnet:countries @=@ BA @=@ BIH +dnet:countries @=@ BA @=@ Bosnia-Hercegovina +dnet:countries @=@ BR @=@ BRA +dnet:countries @=@ BR @=@ Brazil +dnet:countries @=@ BG @=@ Bulgaria +dnet:countries @=@ BF @=@ BFA +dnet:countries @=@ KH @=@ Cambodia +dnet:countries @=@ KH @=@ Cambogia +dnet:countries @=@ KH @=@ Campuchea +dnet:countries @=@ CM @=@ CMR +dnet:countries @=@ CA @=@ CAN +dnet:countries @=@ CA @=@ Canada +dnet:countries @=@ CV @=@ Cape Verde +dnet:countries @=@ CL @=@ CHL +dnet:countries @=@ CL @=@ Chile +dnet:countries @=@ CN @=@ CHN +dnet:countries @=@ CN @=@ China +dnet:countries @=@ CO @=@ COL +dnet:countries @=@ CO @=@ Colombia +dnet:countries @=@ CD @=@ Congo +dnet:countries @=@ CD @=@ Congo Democratic Republic (formerly Zaire) +dnet:countries @=@ CD @=@ Congo, Republic +dnet:countries @=@ CD @=@ Congo, the Democratic Republic of the +dnet:countries @=@ CD @=@ Zaire +dnet:countries @=@ CR @=@ CRI +dnet:countries @=@ CI @=@ CIV +dnet:countries @=@ CI @=@ Ivory Coast +dnet:countries @=@ HR @=@ Croatia +dnet:countries @=@ HR @=@ HRV +dnet:countries @=@ CY @=@ CYP +dnet:countries @=@ CY @=@ 
Cyprus +dnet:countries @=@ CZ @=@ CZE +dnet:countries @=@ CZ @=@ Czech Republic +dnet:countries @=@ CZ @=@ Czechia +dnet:countries @=@ CZ @=@ Czechoslovakia +dnet:countries @=@ DK @=@ DNK +dnet:countries @=@ DK @=@ Denmark +dnet:countries @=@ EC @=@ Ecuador +dnet:countries @=@ EG @=@ EGY +dnet:countries @=@ EG @=@ Egypt +dnet:countries @=@ SV @=@ SLV +dnet:countries @=@ EE @=@ EST +dnet:countries @=@ EE @=@ Estonia +dnet:countries @=@ ET @=@ ETH +dnet:countries @=@ EU @=@ EEC +dnet:countries @=@ FJ @=@ FJI +dnet:countries @=@ FI @=@ FIN +dnet:countries @=@ FI @=@ Finland +dnet:countries @=@ MK @=@ Macedonia +dnet:countries @=@ MK @=@ Macedonia, the Former Yugoslav Republic Of +dnet:countries @=@ MK @=@ North Macedonia +dnet:countries @=@ FR @=@ FRA +dnet:countries @=@ FR @=@ France +dnet:countries @=@ PF @=@ French Polynesia +dnet:countries @=@ PF @=@ PYF +dnet:countries @=@ TF @=@ French Southern Territories +dnet:countries @=@ GE @=@ Georgia +dnet:countries @=@ DE @=@ DEU +dnet:countries @=@ DE @=@ Germany +dnet:countries @=@ DE @=@ Germany, Berlin +dnet:countries @=@ GH @=@ GHA +dnet:countries @=@ GR @=@ EL +dnet:countries @=@ GR @=@ GRC +dnet:countries @=@ GL @=@ GRL +dnet:countries @=@ GN @=@ Guinea +dnet:countries @=@ GW @=@ Guinea-Bissau +dnet:countries @=@ VA @=@ Vatican State +dnet:countries @=@ HK @=@ HKG +dnet:countries @=@ HK @=@ Hong Kong +dnet:countries @=@ HK @=@ Hongkong +dnet:countries @=@ HU @=@ HUN +dnet:countries @=@ HU @=@ Hungary +dnet:countries @=@ IS @=@ ISL +dnet:countries @=@ IN @=@ IND +dnet:countries @=@ IN @=@ India +dnet:countries @=@ ID @=@ IDN +dnet:countries @=@ ID @=@ Indonesia +dnet:countries @=@ IR @=@ Iran +dnet:countries @=@ IR @=@ Iran, Islamic Republic of +dnet:countries @=@ IE @=@ IRL +dnet:countries @=@ IE @=@ Ireland +dnet:countries @=@ IL @=@ ISR +dnet:countries @=@ IL @=@ Israel +dnet:countries @=@ IT @=@ ITA +dnet:countries @=@ IT @=@ Italy +dnet:countries @=@ JM @=@ Jamaica +dnet:countries @=@ JP @=@ JPN +dnet:countries @=@ JP @=@ Japan +dnet:countries @=@ KZ @=@ KAZ +dnet:countries @=@ KZ @=@ Kazakistan +dnet:countries @=@ KZ @=@ Kazakstan +dnet:countries @=@ KE @=@ KEN +dnet:countries @=@ KE @=@ Kenya +dnet:countries @=@ KR @=@ KOR +dnet:countries @=@ KR @=@ Korea, Republic of +dnet:countries @=@ KR @=@ Korean Republic (South Korea) +dnet:countries @=@ KP @=@ PRK +dnet:countries @=@ LV @=@ LVA +dnet:countries @=@ LY @=@ Libya +dnet:countries @=@ LT @=@ LTU +dnet:countries @=@ LU @=@ LUX +dnet:countries @=@ LU @=@ Luxembourg +dnet:countries @=@ MO @=@ Macao +dnet:countries @=@ MG @=@ Madagascar +dnet:countries @=@ MY @=@ Malaysia +dnet:countries @=@ ML @=@ Mali +dnet:countries @=@ MT @=@ Malta +dnet:countries @=@ MU @=@ Mauritius +dnet:countries @=@ MX @=@ MEX +dnet:countries @=@ MX @=@ Mexico +dnet:countries @=@ FM @=@ Micronesia +dnet:countries @=@ MD @=@ Moldova +dnet:countries @=@ MD @=@ Moldova, Republic of +dnet:countries @=@ MN @=@ Mongolia +dnet:countries @=@ MA @=@ Morocco +dnet:countries @=@ MZ @=@ Mozambique +dnet:countries @=@ NA @=@ NAM +dnet:countries @=@ NL @=@ NLD +dnet:countries @=@ NL @=@ Netherlands +dnet:countries @=@ AN @=@ Netherlands Antilles +dnet:countries @=@ NC @=@ NCL +dnet:countries @=@ NZ @=@ NZL +dnet:countries @=@ NZ @=@ New Zealand +dnet:countries @=@ NO @=@ NOR +dnet:countries @=@ NO @=@ Norway +dnet:countries @=@ OC @=@ Australasia +dnet:countries @=@ OM @=@ Oman +dnet:countries @=@ PK @=@ PAK +dnet:countries @=@ PK @=@ Pakistan +dnet:countries @=@ PS @=@ Palestin, State of +dnet:countries @=@ PS @=@ 
Palestine, State of +dnet:countries @=@ PS @=@ Palestinian Territory, Occupied +dnet:countries @=@ PA @=@ PAN +dnet:countries @=@ PA @=@ Panama +dnet:countries @=@ PG @=@ PapuaNew Guinea +dnet:countries @=@ PE @=@ PER +dnet:countries @=@ PH @=@ PHL +dnet:countries @=@ PH @=@ Philippines +dnet:countries @=@ PL @=@ POL +dnet:countries @=@ PL @=@ Poland +dnet:countries @=@ PT @=@ PRT +dnet:countries @=@ PT @=@ Portugal +dnet:countries @=@ PR @=@ Puerto Rico +dnet:countries @=@ RO @=@ ROU +dnet:countries @=@ RO @=@ Romania +dnet:countries @=@ RU @=@ RUS +dnet:countries @=@ RU @=@ Russia +dnet:countries @=@ RU @=@ Russian Federation +dnet:countries @=@ RE @=@ Réunion +dnet:countries @=@ KN @=@ Saint Kitts And Nevis +dnet:countries @=@ SA @=@ Saudi Arabia +dnet:countries @=@ SN @=@ SEN +dnet:countries @=@ RS @=@ SRB +dnet:countries @=@ CS @=@ Serbia and Montenegro +dnet:countries @=@ SG @=@ SGP +dnet:countries @=@ SG @=@ Singapore +dnet:countries @=@ SK @=@ SVK +dnet:countries @=@ SI @=@ SVN +dnet:countries @=@ SI @=@ Slovenia +dnet:countries @=@ ZA @=@ South Africa +dnet:countries @=@ ZA @=@ ZAF +dnet:countries @=@ ES @=@ ESP +dnet:countries @=@ ES @=@ Spain +dnet:countries @=@ LK @=@ LKA +dnet:countries @=@ LK @=@ Sri Lanka +dnet:countries @=@ SD @=@ SDN +dnet:countries @=@ SR @=@ Suriname +dnet:countries @=@ SE @=@ SWE +dnet:countries @=@ SE @=@ Sweden +dnet:countries @=@ CH @=@ CHE +dnet:countries @=@ CH @=@ Switzerland +dnet:countries @=@ SY @=@ Syria +dnet:countries @=@ ST @=@ Sao Tome and Principe +dnet:countries @=@ TW @=@ TWN +dnet:countries @=@ TW @=@ Taiwan +dnet:countries @=@ TW @=@ Taiwan, Province of China +dnet:countries @=@ TZ @=@ Tanzania +dnet:countries @=@ TZ @=@ Tanzania, United Republic of +dnet:countries @=@ TH @=@ THA +dnet:countries @=@ TH @=@ Thailand +dnet:countries @=@ TL @=@ East Timor +dnet:countries @=@ TN @=@ TUN +dnet:countries @=@ TN @=@ Tunisia +dnet:countries @=@ TR @=@ TUR +dnet:countries @=@ TR @=@ Turkey +dnet:countries @=@ UNKNOWN @=@ AAA +dnet:countries @=@ UNKNOWN @=@ [Unknown] +dnet:countries @=@ UNKNOWN @=@ _? 
+dnet:countries @=@ UA @=@ UKR +dnet:countries @=@ UA @=@ Ukraine +dnet:countries @=@ AE @=@ United Arab Emirates +dnet:countries @=@ GB @=@ England +dnet:countries @=@ GB @=@ GBR +dnet:countries @=@ GB @=@ Great Britain +dnet:countries @=@ GB @=@ Great Britain and Northern Ireland +dnet:countries @=@ GB @=@ Scotland +dnet:countries @=@ GB @=@ UK +dnet:countries @=@ GB @=@ United Kingdom +dnet:countries @=@ US @=@ USA +dnet:countries @=@ US @=@ United States +dnet:countries @=@ US @=@ United States of America +dnet:countries @=@ UY @=@ Uruguay +dnet:countries @=@ UZ @=@ Uzbekistan +dnet:countries @=@ VE @=@ Venezuela, Bolivarian Republic of +dnet:countries @=@ VN @=@ Vietnam +dnet:countries @=@ VG @=@ British Virgin Islands +dnet:countries @=@ YU @=@ Jugoslavia +dnet:countries @=@ YU @=@ Yugoslavia +dnet:countries @=@ ZW @=@ ABW +dnet:protocols @=@ oai @=@ OAI-PMH +dnet:protocols @=@ oai @=@ OAI_PMH +dnet:pid_types @=@ orcid @=@ ORCID12 +dnet:pid_types @=@ handle @=@ hdl +dnet:review_levels @=@ 0000 @=@ UNKNOWN +dnet:review_levels @=@ 0002 @=@ 80 大阪経大学会「Working Paper」 +dnet:review_levels @=@ 0002 @=@ AO +dnet:review_levels @=@ 0002 @=@ ARTICLE SANS COMITE DE LECTURE (ASCL) +dnet:review_levels @=@ 0002 @=@ Arbeitspapier +dnet:review_levels @=@ 0002 @=@ Arbeitspapier [workingPaper] +dnet:review_levels @=@ 0002 @=@ Article (author) +dnet:review_levels @=@ 0002 @=@ Article type: preprint +dnet:review_levels @=@ 0002 @=@ Article(author version) +dnet:review_levels @=@ 0002 @=@ Article, not peer-reviewed +dnet:review_levels @=@ 0002 @=@ Articulo no evaluado +dnet:review_levels @=@ 0002 @=@ Artigo Solicitado e Não Avaliado por Pares +dnet:review_levels @=@ 0002 @=@ Artigo não avaliado pelos pares +dnet:review_levels @=@ 0002 @=@ Artigo não avaliado por pares +dnet:review_levels @=@ 0002 @=@ Artigo não avaliado por pres +dnet:review_levels @=@ 0002 @=@ Artikkeli|Artikkeli ammattilehdessä. 
Ei vertaisarvioitu +dnet:review_levels @=@ 0002 @=@ Artículo no evaluado +dnet:review_levels @=@ 0002 @=@ Book (non peer-reviewed) +dnet:review_levels @=@ 0002 @=@ Book Part (author) +dnet:review_levels @=@ 0002 @=@ Book item; Non-peer-reviewed +dnet:review_levels @=@ 0002 @=@ Conference preprint +dnet:review_levels @=@ 0002 @=@ Contribution to book (non peer-reviewed) +dnet:review_levels @=@ 0002 @=@ Discussion Paper +dnet:review_levels @=@ 0002 @=@ Document de travail (Working Paper) +dnet:review_levels @=@ 0002 @=@ Documento de trabajo +dnet:review_levels @=@ 0002 @=@ Documento de trabajo de investigaci??n +dnet:review_levels @=@ 0002 @=@ Draft +dnet:review_levels @=@ 0002 @=@ E-pub ahead of print +dnet:review_levels @=@ 0002 @=@ Editorial de revista, no evaluado por pares +dnet:review_levels @=@ 0002 @=@ Editorial de revista, não avaliado por pares +dnet:review_levels @=@ 0002 @=@ Editorial não avaliado pelos pares +dnet:review_levels @=@ 0002 @=@ Editors (non peer-reviewed) +dnet:review_levels @=@ 0002 @=@ Epub ahead of print +dnet:review_levels @=@ 0002 @=@ Hakemlik Sürecinden Geçmiş Makale +dnet:review_levels @=@ 0002 @=@ Hakemlik sürecindeki makale +dnet:review_levels @=@ 0002 @=@ Hakemlik sürecinden geçmemiş kitap değerlendirmesi +dnet:review_levels @=@ 0002 @=@ Journal Article (author version) +dnet:review_levels @=@ 0002 @=@ Journal Article Preprint +dnet:review_levels @=@ 0002 @=@ Journal Editorial, not peer-reviewed +dnet:review_levels @=@ 0002 @=@ Journal article; Non-peer-reviewed +dnet:review_levels @=@ 0002 @=@ Journal:WorkingPaper +dnet:review_levels @=@ 0002 @=@ Manuscript (preprint) +dnet:review_levels @=@ 0002 @=@ Monográfico (Informes, Documentos de trabajo, etc.) +dnet:review_levels @=@ 0002 @=@ NOTE INTERNE OU DE TRAVAIL +dnet:review_levels @=@ 0002 @=@ Nicht begutachteter Beitrag +dnet:review_levels @=@ 0002 @=@ No evaluado por pares +dnet:review_levels @=@ 0002 @=@ Non-Refereed +dnet:review_levels @=@ 0002 @=@ Non-refeered article +dnet:review_levels @=@ 0002 @=@ Non-refereed Article +dnet:review_levels @=@ 0002 @=@ Non-refereed Book Review +dnet:review_levels @=@ 0002 @=@ Non-refereed Review +dnet:review_levels @=@ 0002 @=@ Non-refereed Text +dnet:review_levels @=@ 0002 @=@ NonPeerReviewed +dnet:review_levels @=@ 0002 @=@ Not Peer reviewed +dnet:review_levels @=@ 0002 @=@ Not Reviewed +dnet:review_levels @=@ 0002 @=@ Not peer-reviewed +dnet:review_levels @=@ 0002 @=@ Não Avaliado por Pares +dnet:review_levels @=@ 0002 @=@ Não avaliada pelos pares +dnet:review_levels @=@ 0002 @=@ Não avaliado pelos pares +dnet:review_levels @=@ 0002 @=@ Original article (non peer-reviewed) +dnet:review_levels @=@ 0002 @=@ Other publication (non peer-review) +dnet:review_levels @=@ 0002 @=@ Pre Print +dnet:review_levels @=@ 0002 @=@ Pre-print +dnet:review_levels @=@ 0002 @=@ Preprint Article +dnet:review_levels @=@ 0002 @=@ Preprints +dnet:review_levels @=@ 0002 @=@ Preprints, Working Papers, ... +dnet:review_levels @=@ 0002 @=@ Rapporto tecnico / Working Paper / Rapporto di progetto +dnet:review_levels @=@ 0002 @=@ Resumo Não Avaliado por Pares +dnet:review_levels @=@ 0002 @=@ Review article (non peer-reviewed) +dnet:review_levels @=@ 0002 @=@ SMUR +dnet:review_levels @=@ 0002 @=@ Submissão dos artigos +dnet:review_levels @=@ 0002 @=@ Submitted version +dnet:review_levels @=@ 0002 @=@ Vertaisarvioimaton kirjan tai muun kokoomateoksen osa +dnet:review_levels @=@ 0002 @=@ Vorabdruck +dnet:review_levels @=@ 0002 @=@ Wetensch. publ. 
non-refereed +dnet:review_levels @=@ 0002 @=@ Working / discussion paper +dnet:review_levels @=@ 0002 @=@ Working Document +dnet:review_levels @=@ 0002 @=@ Working Notes +dnet:review_levels @=@ 0002 @=@ Working Paper +dnet:review_levels @=@ 0002 @=@ Working Paper / Technical Report +dnet:review_levels @=@ 0002 @=@ Working Papers +dnet:review_levels @=@ 0002 @=@ WorkingPaper +dnet:review_levels @=@ 0002 @=@ article in non peer-reviewed journal +dnet:review_levels @=@ 0002 @=@ articolo preliminare +dnet:review_levels @=@ 0002 @=@ articulo preliminar +dnet:review_levels @=@ 0002 @=@ articulo sin revision por pares +dnet:review_levels @=@ 0002 @=@ artigo preliminar +dnet:review_levels @=@ 0002 @=@ artigo sem revisão +dnet:review_levels @=@ 0002 @=@ artículo preliminar +dnet:review_levels @=@ 0002 @=@ artículo sin revisión por pares +dnet:review_levels @=@ 0002 @=@ bookchapter (author version) +dnet:review_levels @=@ 0002 @=@ borrador +dnet:review_levels @=@ 0002 @=@ column (author version) +dnet:review_levels @=@ 0002 @=@ communication_invitee +dnet:review_levels @=@ 0002 @=@ doc-type:preprint +dnet:review_levels @=@ 0002 @=@ doc-type:workingPaper +dnet:review_levels @=@ 0002 @=@ draf +dnet:review_levels @=@ 0002 @=@ eu-repo/semantics/submittedVersion +dnet:review_levels @=@ 0002 @=@ http://purl.org/coar/resource_type/c_8042 +dnet:review_levels @=@ 0002 @=@ http://purl.org/coar/resource_type/c_816b +dnet:review_levels @=@ 0002 @=@ http://purl.org/coar/version/c_71e4c1898caa6e32 +dnet:review_levels @=@ 0002 @=@ http://purl.org/coar/version/c_b1a7d7d4d402bcce +dnet:review_levels @=@ 0002 @=@ http://purl.org/eprint/type/SubmittedBookItem +dnet:review_levels @=@ 0002 @=@ http://purl.org/eprint/type/SubmittedJournalArticle +dnet:review_levels @=@ 0002 @=@ http://purl.org/info:eu-repo/semantics/authorVersion +dnet:review_levels @=@ 0002 @=@ http://purl.org/info:eu-repo/semantics/submittedVersion +dnet:review_levels @=@ 0002 @=@ http://purl.org/spar/fabio/Preprint +dnet:review_levels @=@ 0002 @=@ http://purl.org/spar/fabio/WorkingPaper +dnet:review_levels @=@ 0002 @=@ https://dictionary.casrai.org/Preprint +dnet:review_levels @=@ 0002 @=@ info:ar-repo/semantics/documento de trabajo +dnet:review_levels @=@ 0002 @=@ info:ar-repo/semantics/documentoDeTrabajo +dnet:review_levels @=@ 0002 @=@ info:eu repo/semantics/draft +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/authorVersion +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/draft +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/preprint +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/submitedVersion +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/submittedVersion +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/unReviewed +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/updatedVersion +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/workingPaper +dnet:review_levels @=@ 0002 @=@ info:eu-repo/submittedVersion +dnet:review_levels @=@ 0002 @=@ info:ulb-repo/semantics/articleNonPeerReview +dnet:review_levels @=@ 0002 @=@ info:ulb-repo/semantics/openurl/vlink-workingpaper +dnet:review_levels @=@ 0002 @=@ info:ulb-repo/semantics/workingPaper +dnet:review_levels @=@ 0002 @=@ non peer-reviewed article +dnet:review_levels @=@ 0002 @=@ non-refereed review article +dnet:review_levels @=@ 0002 @=@ não avaliado +dnet:review_levels @=@ 0002 @=@ preprint +dnet:review_levels @=@ 0002 @=@ prepublicación +dnet:review_levels @=@ 0002 @=@ proceeding, seminar, workshop without peer review +dnet:review_levels @=@ 
0002 @=@ proceedings (author version) +dnet:review_levels @=@ 0002 @=@ pré-print +dnet:review_levels @=@ 0002 @=@ pré-publication +dnet:review_levels @=@ 0002 @=@ préprint +dnet:review_levels @=@ 0002 @=@ prépublication +dnet:review_levels @=@ 0002 @=@ publicació preliminar +dnet:review_levels @=@ 0002 @=@ publication-preprint +dnet:review_levels @=@ 0002 @=@ publication-workingpaper +dnet:review_levels @=@ 0002 @=@ submitedVersion +dnet:review_levels @=@ 0002 @=@ submittedVersion +dnet:review_levels @=@ 0002 @=@ voordruk +dnet:review_levels @=@ 0002 @=@ workingPaper +dnet:review_levels @=@ 0002 @=@ ön baskı +dnet:review_levels @=@ 0002 @=@ Препринт +dnet:review_levels @=@ 0002 @=@ предпечатная версия публикации +dnet:review_levels @=@ 0002 @=@ препринт статьи +dnet:review_levels @=@ 0002 @=@ ディスカッション/ワーキング・ペーパー DP/WP +dnet:review_levels @=@ 0002 @=@ プレプリント +dnet:review_levels @=@ 0002 @=@ プレプリント Preprint +dnet:review_levels @=@ 0002 @=@ プレプリント(Preprint) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-その他(査読無し) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-テクニカルレポート類(査読無し) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-会議発表論文(査読無し) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-図書(査読無し) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-学術雑誌論文(査読無し) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-紀要論文(査読無し) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-雑誌記事(査読無し) +dnet:review_levels @=@ 0002 @=@ 预印本 +dnet:review_levels @=@ 0001 @=@ ##rt.metadata.pkp.peerReviewed## +dnet:review_levels @=@ 0001 @=@ A1 Alkuperäisartikkeli tieteellisessä aikakauslehdessä +dnet:review_levels @=@ 0001 @=@ Art?culo revisado por pares +dnet:review_levels @=@ 0001 @=@ Article revisat per persones expertes +dnet:review_levels @=@ 0001 @=@ Article type: peer review +dnet:review_levels @=@ 0001 @=@ Article évalué par les pairs +dnet:review_levels @=@ 0001 @=@ Article évalué par des pairs +dnet:review_levels @=@ 0001 @=@ Article évalué par les pairs +dnet:review_levels @=@ 0001 @=@ Articolo valutato secondo i criteri della peer review +dnet:review_levels @=@ 0001 @=@ Articulo evaluado por dos pares +dnet:review_levels @=@ 0001 @=@ Articulo revisado por pares +dnet:review_levels @=@ 0001 @=@ Artigo Avaliado pelos Pares +dnet:review_levels @=@ 0001 @=@ Artigo Revisto por Pares +dnet:review_levels @=@ 0001 @=@ Artigo avaliado por blind peer review +dnet:review_levels @=@ 0001 @=@ Artigo avaliado por pares +dnet:review_levels @=@ 0001 @=@ Artigo de convidado. 
Avaliado pelos pares +dnet:review_levels @=@ 0001 @=@ Artigos; Avaliado pelos pares +dnet:review_levels @=@ 0001 @=@ Artículo de investigación, Investigaciones originales, Artículo evaluado por pares, Investigaciones empíricas +dnet:review_levels @=@ 0001 @=@ Artículo evaluado por pares +dnet:review_levels @=@ 0001 @=@ Artículo evaluado por pares, Ensayos de investigación +dnet:review_levels @=@ 0001 @=@ Artículo evaluado por pares, Investigaciones empíricas, Artículos de investigación +dnet:review_levels @=@ 0001 @=@ Artículo revisado +dnet:review_levels @=@ 0001 @=@ Artículo revisado por pares +dnet:review_levels @=@ 0001 @=@ Artículos de estudiantes, Artículo evaluado por pares, Artículos de investigación +dnet:review_levels @=@ 0001 @=@ Artículos de investigación evaluados por doble ciego +dnet:review_levels @=@ 0001 @=@ Artículos evaluadores por doble ciego +dnet:review_levels @=@ 0001 @=@ Artículos evaluados por pares +dnet:review_levels @=@ 0001 @=@ Artículos evaluados por pares académicos +dnet:review_levels @=@ 0001 @=@ Artículos revisados por pares +dnet:review_levels @=@ 0001 @=@ Avaliadas pelos pares +dnet:review_levels @=@ 0001 @=@ Avaliado anonimamente por pares +dnet:review_levels @=@ 0001 @=@ Avaliado em duplo cego por pares +dnet:review_levels @=@ 0001 @=@ Avaliado pela Editoria +dnet:review_levels @=@ 0001 @=@ Avaliado pela Editoria. Avaliado pelos pares. +dnet:review_levels @=@ 0001 @=@ Avaliado pelo Editoria +dnet:review_levels @=@ 0001 @=@ Avaliado pelo pares +dnet:review_levels @=@ 0001 @=@ Avaliado pelos Editores +dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares +dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares, Artigo de convidado +dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares, Artigos Originais +dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares, Artigos Originais, Artigos de Revisão +dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares. 
Avaliado pelo Editoria +dnet:review_levels @=@ 0001 @=@ Avaliado po Pares +dnet:review_levels @=@ 0001 @=@ Avaliado por Editor +dnet:review_levels @=@ 0001 @=@ Avaliado por pares +dnet:review_levels @=@ 0001 @=@ Avaliados pelos pares +dnet:review_levels @=@ 0001 @=@ Avaliados por Pares +dnet:review_levels @=@ 0001 @=@ Blind Peer-reviewed Article +dnet:review_levels @=@ 0001 @=@ Book (peer-reviewed) +dnet:review_levels @=@ 0001 @=@ Comentario de libros, Comentario de revistas, Comentario de conferencias, Artículo evaluado por pares, Artículo de investigación +dnet:review_levels @=@ 0001 @=@ Conference paper; Peer-reviewed +dnet:review_levels @=@ 0001 @=@ Contribution to book (peer-reviewed) +dnet:review_levels @=@ 0001 @=@ Documento Avaliado por Pares +dnet:review_levels @=@ 0001 @=@ Double blind evaluation articles +dnet:review_levels @=@ 0001 @=@ Double blind peer review +dnet:review_levels @=@ 0001 @=@ Editors (peer-reviewed) +dnet:review_levels @=@ 0001 @=@ Evaluación por pares +dnet:review_levels @=@ 0001 @=@ Evaluado por pares +dnet:review_levels @=@ 0001 @=@ Evaluados por los pares +dnet:review_levels @=@ 0001 @=@ Hakem sürecinden geçmiş makale +dnet:review_levels @=@ 0001 @=@ Hakemli makale +dnet:review_levels @=@ 0001 @=@ Hakemlik Sürecinden Geçmiş +dnet:review_levels @=@ 0001 @=@ Invited Peer-Reviewed Article +dnet:review_levels @=@ 0001 @=@ Journal article; Peer-reviewed +dnet:review_levels @=@ 0001 @=@ Original article (peer-reviewed) +dnet:review_levels @=@ 0001 @=@ Other publication (peer-review) +dnet:review_levels @=@ 0001 @=@ Paper peer-reviewed +dnet:review_levels @=@ 0001 @=@ Papers evaluated by academic peers +dnet:review_levels @=@ 0001 @=@ Peer reviewed +dnet:review_levels @=@ 0001 @=@ Peer reviewed article +dnet:review_levels @=@ 0001 @=@ Peer reviewed invited commentry +dnet:review_levels @=@ 0001 @=@ Peer-Reviewed Protocol +dnet:review_levels @=@ 0001 @=@ Peer-reviewd Article +dnet:review_levels @=@ 0001 @=@ Peer-reviewed +dnet:review_levels @=@ 0001 @=@ Peer-reviewed Article +dnet:review_levels @=@ 0001 @=@ Peer-reviewed Paper +dnet:review_levels @=@ 0001 @=@ Peer-reviewed Review +dnet:review_levels @=@ 0001 @=@ Peer-reviewed Review Article +dnet:review_levels @=@ 0001 @=@ Peer-reviewed Text +dnet:review_levels @=@ 0001 @=@ Peer-reviewed communication +dnet:review_levels @=@ 0001 @=@ Peer-reviewed conference proceedings +dnet:review_levels @=@ 0001 @=@ Peer-reviewed research article +dnet:review_levels @=@ 0001 @=@ Peer-reviewed short communication +dnet:review_levels @=@ 0001 @=@ PeerReviewed +dnet:review_levels @=@ 0001 @=@ Proceedings (peer-reviewed) +dnet:review_levels @=@ 0001 @=@ Refereed +dnet:review_levels @=@ 0001 @=@ Refereed Article +dnet:review_levels @=@ 0001 @=@ Research articles evaluated by double blind +dnet:review_levels @=@ 0001 @=@ Resenha avaliada pelos pares +dnet:review_levels @=@ 0001 @=@ Review article (peer-reviewed) +dnet:review_levels @=@ 0001 @=@ Reviewed by peers +dnet:review_levels @=@ 0001 @=@ Revisión por Expertos +dnet:review_levels @=@ 0001 @=@ Revisto por Pares +dnet:review_levels @=@ 0001 @=@ SBBq abstracts / peer-reviewed +dnet:review_levels @=@ 0001 @=@ SBBq resúmenes - revisada por pares +dnet:review_levels @=@ 0001 @=@ Scholarly publ. 
Refereed +dnet:review_levels @=@ 0001 @=@ Scientific Publ (refereed) +dnet:review_levels @=@ 0001 @=@ Vertaisarvioimaton kirjoitus tieteellisessä aikakauslehdessä +dnet:review_levels @=@ 0001 @=@ Vertaisarvioitu alkuperäisartikkeli tieteellisessä aikakauslehdessä +dnet:review_levels @=@ 0001 @=@ Vertaisarvioitu artikkeli konferenssijulkaisussa +dnet:review_levels @=@ 0001 @=@ Vertaisarvioitu artikkeli tieteellisessä aikakauslehdessä +dnet:review_levels @=@ 0001 @=@ Vertaisarvioitu kirjan tai muun kokoomateoksen osa +dnet:review_levels @=@ 0001 @=@ Wetensch. publ. Refereed +dnet:review_levels @=@ 0001 @=@ article in peer-reviewed journal +dnet:review_levels @=@ 0001 @=@ articles validés +dnet:review_levels @=@ 0001 @=@ avaliado por pares, temas livres +dnet:review_levels @=@ 0001 @=@ info:eu-repo/semantics/peerReviewed +dnet:review_levels @=@ 0001 @=@ info:ulb-repo/semantics/articlePeerReview +dnet:review_levels @=@ 0001 @=@ proceeding with peer review +dnet:review_levels @=@ 0001 @=@ refereed_publications +dnet:review_levels @=@ 0001 @=@ ul_published_reviewed +dnet:review_levels @=@ 0001 @=@ Άρθρο που έχει αξιολογηθεί από ομότιμους ειδικούς +dnet:review_levels @=@ 0001 @=@ Άρθρο το οποίο έχει περάσει από ομότιμη αξιολόγηση +dnet:review_levels @=@ 0001 @=@ レフェリー付き論文 +dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-テクニカルレポート類(査読有り) +dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-会議発表論文(査読有り) +dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-図書(査読有り) +dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-学術雑誌論文(査読有り) +dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-紀要論文(査読有り) +dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-雑誌記事(査読有り) +dnet:review_levels @=@ 0001 @=@ 原著論文(査読有り) +dnet:review_levels @=@ 0001 @=@ 査読論文 \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/terms.txt b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/terms.txt new file mode 100644 index 000000000..93cc00eca --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/terms.txt @@ -0,0 +1,1080 @@ +ModularUiLabels @=@ ModularUiLabels @=@ PendingRepositoryResources @=@ Pending datasource +ModularUiLabels @=@ ModularUiLabels @=@ RepositoryServiceResources @=@ Valid datasource +dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ file::EuropePMC @=@ file::EuropePMC +dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ file::PDF @=@ file::PDF +dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ file::WoS @=@ file::WoS +dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ metadata @=@ metadata +dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ file::hybrid @=@ file::hybrid +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:cris @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:actionset:orcidworks-no-doi @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:infospace @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:aggregator @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:datasetarchive @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:actionset @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ 
sysimport:crosswalk:entityregistry @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:repository @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:aggregator @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:subject @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:zenodocommunity @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ iis @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:entityregistry @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:organization @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:infospace @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:dedup @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:datasource @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ propagation:project:semrel @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:cris @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:repository @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:datasetarchive @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:semrel @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ user:claim @=@ Linked by user +dnet:provenanceActions @=@ dnet:provenanceActions @=@ user:claim:pid @=@ Linked by user +dnet:provenanceActions @=@ dnet:provenanceActions @=@ user:insert @=@ Linked by user +dnet:provenanceActions @=@ dnet:provenanceActions @=@ user:claim:search @=@ Linked by user +dnet:provenanceActions @=@ dnet:provenanceActions @=@ UNKNOWN @=@ UNKNOWN +dnet:provenanceActions @=@ dnet:provenanceActions @=@ country:instrepos @=@ Inferred by OpenAIRE +dnet:access_modes @=@ dnet:access_modes @=@ 12MONTHS @=@ 12 Months Embargo +dnet:access_modes @=@ dnet:access_modes @=@ 6MONTHS @=@ 6 Months Embargo +dnet:access_modes @=@ dnet:access_modes @=@ CLOSED @=@ Closed Access +dnet:access_modes @=@ dnet:access_modes @=@ EMBARGO @=@ Embargo +dnet:access_modes @=@ dnet:access_modes @=@ OPEN @=@ Open Access +dnet:access_modes @=@ dnet:access_modes @=@ OPEN SOURCE @=@ Open Source +dnet:access_modes @=@ dnet:access_modes @=@ OTHER @=@ Other +dnet:access_modes @=@ dnet:access_modes @=@ RESTRICTED @=@ Restricted +dnet:access_modes @=@ dnet:access_modes @=@ UNKNOWN @=@ not available +fct:funding_typologies @=@ fct:funding_typologies @=@ fct:program @=@ fct:program +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire2.0 @=@ OpenAIRE 2.0 (EC funding) +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire3.0 @=@ OpenAIRE 3.0 (OA, funding) +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ driver @=@ OpenAIRE Basic (DRIVER OA) +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire-cris_1.1 @=@ OpenAIRE CRIS v1.1 +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire2.0_data @=@ OpenAIRE Data (funded, referenced datasets) +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire-pub_4.0 @=@ OpenAIRE PubRepos v4.0 +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ hostedBy @=@ collected from a compatible aggregator 
+dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ files @=@ files +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ native @=@ native +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ UNKNOWN @=@ not available +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ notCompatible @=@ under validation +dnet:dataCite_date @=@ dnet:dataCite_date @=@ UNKNOWN @=@ UNKNOWN +dnet:dataCite_date @=@ dnet:dataCite_date @=@ available @=@ available +dnet:dataCite_date @=@ dnet:dataCite_date @=@ copyrighted @=@ copyrighted +dnet:dataCite_date @=@ dnet:dataCite_date @=@ created @=@ created +dnet:dataCite_date @=@ dnet:dataCite_date @=@ endDate @=@ endDate +dnet:dataCite_date @=@ dnet:dataCite_date @=@ issued @=@ issued +dnet:dataCite_date @=@ dnet:dataCite_date @=@ startDate @=@ startDate +dnet:dataCite_date @=@ dnet:dataCite_date @=@ submitted @=@ submitted +dnet:dataCite_date @=@ dnet:dataCite_date @=@ updated @=@ updated +dnet:dataCite_date @=@ dnet:dataCite_date @=@ valid @=@ valid +dnet:dataCite_date @=@ dnet:dataCite_date @=@ published-print @=@ published-print +dnet:dataCite_date @=@ dnet:dataCite_date @=@ published-online @=@ published-online +dnet:dataCite_date @=@ dnet:dataCite_date @=@ accepted @=@ accepted +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ crissystem @=@ CRIS System +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ datarepository::unknown @=@ Data Repository +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::datarepository @=@ Data Repository Aggregator +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::projects @=@ Funder database +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ infospace @=@ Information Space +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::institutional @=@ Institutional Repository +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::pubsrepository::institutional @=@ Institutional Repository Aggregator +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::journal @=@ Journal +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::pubsrepository::journals @=@ Journal Aggregator/Publisher +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::mock @=@ Other +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubscatalogue::unknown @=@ Publication Catalogue +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::unknown @=@ Publication Repository +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::pubsrepository::unknown @=@ Publication Repository Aggregator +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry @=@ Registry +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::repositories @=@ Registry of repositories +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::products @=@ Registry of research products +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::researchers @=@ Registry of researchers +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::organizations @=@ Registry of organizations +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ scholarcomminfra @=@ Scholarly Comm. 
Infrastructure +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ softwarerepository @=@ Software Repository +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::thematic @=@ Thematic Repository +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ websource @=@ Web Source +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::softwarerepository @=@ Software Repository Aggregator +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ orprepository @=@ Repository +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ researchgraph @=@ Research Graph +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ ACM @=@ ACM Computing Classification System +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ agrovoc @=@ AGROVOC +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ bicssc @=@ BIC standard subject categories +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ DFG @=@ DFG Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ ddc @=@ Dewey Decimal Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ nsf:fieldOfApplication @=@ Field of Application (NSF) +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ gok @=@ Göttingen Online Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ ec:h2020topics @=@ Horizon 2020 Topics +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ IPC @=@ International Patent Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ jel @=@ JEL Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ lcsh @=@ Library of Congress Subject Headings +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ msc @=@ Mathematics Subject Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ mesheuropmc @=@ Medical Subject Headings +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ mesh @=@ Medical Subject Headings +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ bk @=@ Nederlandse basisclassificatie +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ dnet:od_subjects @=@ OpenDOAR subjects +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ ocis @=@ Optics Classification and Indexing Scheme +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ pacs @=@ Physics and Astronomy Classification Scheme +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ rvk @=@ Regensburger Verbundklassifikation +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ UNKNOWN @=@ UNKNOWN +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ udc @=@ Universal Decimal Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ wos @=@ Web of Science Subject Areas +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ arxiv @=@ arXiv 
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ keyword @=@ keyword +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ MAG @=@ Microsoft Academic Graph classification +fct:contractTypes @=@ fct:contractTypes @=@ UNKNOWN @=@ UNKNOWN +dnet:publication_resource @=@ dnet:publication_resource @=@ 0018 @=@ Annotation +dnet:publication_resource @=@ dnet:publication_resource @=@ 0001 @=@ Article +dnet:publication_resource @=@ dnet:publication_resource @=@ 0033 @=@ Audiovisual +dnet:publication_resource @=@ dnet:publication_resource @=@ 0008 @=@ Bachelor thesis +dnet:publication_resource @=@ dnet:publication_resource @=@ 0046 @=@ Bioentity +dnet:publication_resource @=@ dnet:publication_resource @=@ 0002 @=@ Book +dnet:publication_resource @=@ dnet:publication_resource @=@ 0037 @=@ Clinical Trial +dnet:publication_resource @=@ dnet:publication_resource @=@ 0022 @=@ Collection +dnet:publication_resource @=@ dnet:publication_resource @=@ 0004 @=@ Conference object +dnet:publication_resource @=@ dnet:publication_resource @=@ 0005 @=@ Contribution for newspaper or weekly magazine +dnet:publication_resource @=@ dnet:publication_resource @=@ 0045 @=@ Data Management Plan +dnet:publication_resource @=@ dnet:publication_resource @=@ 0031 @=@ Data Paper +dnet:publication_resource @=@ dnet:publication_resource @=@ 0021 @=@ Dataset +dnet:publication_resource @=@ dnet:publication_resource @=@ 0006 @=@ Doctoral thesis +dnet:publication_resource @=@ dnet:publication_resource @=@ 0023 @=@ Event +dnet:publication_resource @=@ dnet:publication_resource @=@ 0009 @=@ External research report +dnet:publication_resource @=@ dnet:publication_resource @=@ 0024 @=@ Film +dnet:publication_resource @=@ dnet:publication_resource @=@ 0025 @=@ Image +dnet:publication_resource @=@ dnet:publication_resource @=@ 0026 @=@ InteractiveResource +dnet:publication_resource @=@ dnet:publication_resource @=@ 0011 @=@ Internal report +dnet:publication_resource @=@ dnet:publication_resource @=@ 0043 @=@ Journal +dnet:publication_resource @=@ dnet:publication_resource @=@ 0010 @=@ Lecture +dnet:publication_resource @=@ dnet:publication_resource @=@ 0007 @=@ Master thesis +dnet:publication_resource @=@ dnet:publication_resource @=@ 0027 @=@ Model +dnet:publication_resource @=@ dnet:publication_resource @=@ 0012 @=@ Newsletter +dnet:publication_resource @=@ dnet:publication_resource @=@ 0020 @=@ Other ORP type +dnet:publication_resource @=@ dnet:publication_resource @=@ 0039 @=@ Other dataset type +dnet:publication_resource @=@ dnet:publication_resource @=@ 0038 @=@ Other literature type +dnet:publication_resource @=@ dnet:publication_resource @=@ 0040 @=@ Other software type +dnet:publication_resource @=@ dnet:publication_resource @=@ 0013 @=@ Part of book or chapter of book +dnet:publication_resource @=@ dnet:publication_resource @=@ 0019 @=@ Patent +dnet:publication_resource @=@ dnet:publication_resource @=@ 0028 @=@ PhysicalObject +dnet:publication_resource @=@ dnet:publication_resource @=@ 0016 @=@ Preprint +dnet:publication_resource @=@ dnet:publication_resource @=@ 0034 @=@ Project deliverable +dnet:publication_resource @=@ dnet:publication_resource @=@ 0035 @=@ Project milestone +dnet:publication_resource @=@ dnet:publication_resource @=@ 0036 @=@ Project proposal +dnet:publication_resource @=@ dnet:publication_resource @=@ 0017 @=@ Report +dnet:publication_resource @=@ dnet:publication_resource @=@ 0014 @=@ Research +dnet:publication_resource @=@ 
dnet:publication_resource @=@ 0015 @=@ Review +dnet:publication_resource @=@ dnet:publication_resource @=@ 0029 @=@ Software +dnet:publication_resource @=@ dnet:publication_resource @=@ 0032 @=@ Software Paper +dnet:publication_resource @=@ dnet:publication_resource @=@ 0030 @=@ Sound +dnet:publication_resource @=@ dnet:publication_resource @=@ 0044 @=@ Thesis +dnet:publication_resource @=@ dnet:publication_resource @=@ 0000 @=@ Unknown +dnet:publication_resource @=@ dnet:publication_resource @=@ 0042 @=@ Virtual Appliance +ec:funding_typologies @=@ ec:funding_typologies @=@ ec:frameworkprogram @=@ frameworkprogram +ec:funding_typologies @=@ ec:funding_typologies @=@ ec:program @=@ program +ec:funding_typologies @=@ ec:funding_typologies @=@ ec:specificprogram @=@ specificprogram +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ 171 @=@ Article 171 of the Treaty +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ BSG @=@ Research for the benefit of specific groups +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ CIP-EIP-TN @=@ CIP-Eco-Innovation - CIP-Thematic Network +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ CP @=@ Collaborative project +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ CP-CSA @=@ Combination of CP & CSA +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ CSA @=@ Coordination and support action +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ ERC @=@ Support for frontier research (ERC) +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ MC @=@ Support for training and career development of researchers (Marie Curie) +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ NoE @=@ Network of Excellence +wt:funding_relations @=@ wt:funding_relations @=@ wt:hasParentFunding @=@ wt:hasParentFunding +dnet:languages @=@ dnet:languages @=@ abk @=@ Abkhazian +dnet:languages @=@ dnet:languages @=@ ace @=@ Achinese +dnet:languages @=@ dnet:languages @=@ ach @=@ Acoli +dnet:languages @=@ dnet:languages @=@ ada @=@ Adangme +dnet:languages @=@ dnet:languages @=@ aar @=@ Afar +dnet:languages @=@ dnet:languages @=@ afh @=@ Afrihili +dnet:languages @=@ dnet:languages @=@ afr @=@ Afrikaans +dnet:languages @=@ dnet:languages @=@ afa @=@ Afro-Asiatic +dnet:languages @=@ dnet:languages @=@ aka @=@ Akan +dnet:languages @=@ dnet:languages @=@ akk @=@ Akkadian +dnet:languages @=@ dnet:languages @=@ alb/sqi @=@ Albanian +dnet:languages @=@ dnet:languages @=@ ale @=@ Aleut +dnet:languages @=@ dnet:languages @=@ alg @=@ Algonquian languages +dnet:languages @=@ dnet:languages @=@ tut @=@ Altaic +dnet:languages @=@ dnet:languages @=@ amh @=@ Amharic +dnet:languages @=@ dnet:languages @=@ egy @=@ Ancient Egyptian +dnet:languages @=@ dnet:languages @=@ grc @=@ Ancient Greek +dnet:languages @=@ dnet:languages @=@ apa @=@ Apache +dnet:languages @=@ dnet:languages @=@ ara @=@ Arabic +dnet:languages @=@ dnet:languages @=@ arg @=@ Aragonese +dnet:languages @=@ dnet:languages @=@ arc @=@ Aramaic +dnet:languages @=@ dnet:languages @=@ arp @=@ Arapaho +dnet:languages @=@ dnet:languages @=@ arn @=@ Araucanian +dnet:languages @=@ dnet:languages @=@ arw @=@ Arawak +dnet:languages @=@ dnet:languages @=@ arm/hye @=@ Armenian +dnet:languages @=@ dnet:languages @=@ art @=@ Artificial +dnet:languages @=@ dnet:languages @=@ asm @=@ Assamese +dnet:languages @=@ dnet:languages @=@ ath @=@ Athapascan +dnet:languages @=@ dnet:languages @=@ map @=@ Austronesian +dnet:languages @=@ dnet:languages @=@ ina @=@ Auxiliary Language Association) +dnet:languages @=@ dnet:languages @=@ ava @=@ Avaric +dnet:languages @=@ dnet:languages @=@ ave 
@=@ Avestan +dnet:languages @=@ dnet:languages @=@ awa @=@ Awadhi +dnet:languages @=@ dnet:languages @=@ aym @=@ Aymara +dnet:languages @=@ dnet:languages @=@ aze @=@ Azerbaijani +dnet:languages @=@ dnet:languages @=@ nah @=@ Aztec +dnet:languages @=@ dnet:languages @=@ ban @=@ Balinese +dnet:languages @=@ dnet:languages @=@ bat @=@ Baltic +dnet:languages @=@ dnet:languages @=@ bal @=@ Baluchi +dnet:languages @=@ dnet:languages @=@ bam @=@ Bambara +dnet:languages @=@ dnet:languages @=@ bai @=@ Bamileke +dnet:languages @=@ dnet:languages @=@ bad @=@ Banda +dnet:languages @=@ dnet:languages @=@ bnt @=@ Bantu +dnet:languages @=@ dnet:languages @=@ bas @=@ Basa +dnet:languages @=@ dnet:languages @=@ bak @=@ Bashkir +dnet:languages @=@ dnet:languages @=@ baq/eus @=@ Basque +dnet:languages @=@ dnet:languages @=@ bej @=@ Beja +dnet:languages @=@ dnet:languages @=@ bel @=@ Belarusian +dnet:languages @=@ dnet:languages @=@ bem @=@ Bemba +dnet:languages @=@ dnet:languages @=@ ben @=@ Bengali +dnet:languages @=@ dnet:languages @=@ ber @=@ Berber +dnet:languages @=@ dnet:languages @=@ bho @=@ Bhojpuri +dnet:languages @=@ dnet:languages @=@ bih @=@ Bihari +dnet:languages @=@ dnet:languages @=@ bik @=@ Bikol +dnet:languages @=@ dnet:languages @=@ bin @=@ Bini +dnet:languages @=@ dnet:languages @=@ bis @=@ Bislama +dnet:languages @=@ dnet:languages @=@ nob @=@ Bokmål, Norwegian; Norwegian Bokmål +dnet:languages @=@ dnet:languages @=@ bos @=@ Bosnian +dnet:languages @=@ dnet:languages @=@ bra @=@ Braj +dnet:languages @=@ dnet:languages @=@ bre @=@ Breton +dnet:languages @=@ dnet:languages @=@ bug @=@ Buginese +dnet:languages @=@ dnet:languages @=@ bul @=@ Bulgarian +dnet:languages @=@ dnet:languages @=@ bua @=@ Buriat +dnet:languages @=@ dnet:languages @=@ bur/mya @=@ Burmese +dnet:languages @=@ dnet:languages @=@ cad @=@ Caddo +dnet:languages @=@ dnet:languages @=@ car @=@ Carib +dnet:languages @=@ dnet:languages @=@ cat @=@ Catalan; Valencian +dnet:languages @=@ dnet:languages @=@ cau @=@ Caucasian +dnet:languages @=@ dnet:languages @=@ ceb @=@ Cebuano +dnet:languages @=@ dnet:languages @=@ cel @=@ Celtic +dnet:languages @=@ dnet:languages @=@ cai @=@ Central American Indian +dnet:languages @=@ dnet:languages @=@ chg @=@ Chagatai +dnet:languages @=@ dnet:languages @=@ cha @=@ Chamorro +dnet:languages @=@ dnet:languages @=@ che @=@ Chechen +dnet:languages @=@ dnet:languages @=@ chr @=@ Cherokee +dnet:languages @=@ dnet:languages @=@ nya @=@ Chewa; Chichewa; Nyanja +dnet:languages @=@ dnet:languages @=@ chy @=@ Cheyenne +dnet:languages @=@ dnet:languages @=@ chb @=@ Chibcha +dnet:languages @=@ dnet:languages @=@ chi/zho @=@ Chinese +dnet:languages @=@ dnet:languages @=@ chn @=@ Chinook jargon +dnet:languages @=@ dnet:languages @=@ cho @=@ Choctaw +dnet:languages @=@ dnet:languages @=@ chu @=@ Church Slavic; Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic +dnet:languages @=@ dnet:languages @=@ chv @=@ Chuvash +dnet:languages @=@ dnet:languages @=@ cop @=@ Coptic +dnet:languages @=@ dnet:languages @=@ cor @=@ Cornish +dnet:languages @=@ dnet:languages @=@ cos @=@ Corsican +dnet:languages @=@ dnet:languages @=@ cre @=@ Cree +dnet:languages @=@ dnet:languages @=@ mus @=@ Creek +dnet:languages @=@ dnet:languages @=@ crp @=@ Creoles and Pidgins +dnet:languages @=@ dnet:languages @=@ hrv @=@ Croatian +dnet:languages @=@ dnet:languages @=@ cus @=@ Cushitic +dnet:languages @=@ dnet:languages @=@ ces/cze @=@ Czech +dnet:languages @=@ dnet:languages @=@ dak @=@ Dakota +dnet:languages @=@ dnet:languages 
@=@ dan @=@ Danish +dnet:languages @=@ dnet:languages @=@ del @=@ Delaware +dnet:languages @=@ dnet:languages @=@ din @=@ Dinka +dnet:languages @=@ dnet:languages @=@ div @=@ Divehi +dnet:languages @=@ dnet:languages @=@ doi @=@ Dogri +dnet:languages @=@ dnet:languages @=@ dra @=@ Dravidian +dnet:languages @=@ dnet:languages @=@ dua @=@ Duala +dnet:languages @=@ dnet:languages @=@ dut/nld @=@ Dutch; Flemish +dnet:languages @=@ dnet:languages @=@ dyu @=@ Dyula +dnet:languages @=@ dnet:languages @=@ dzo @=@ Dzongkha +dnet:languages @=@ dnet:languages @=@ efi @=@ Efik +dnet:languages @=@ dnet:languages @=@ eka @=@ Ekajuk +dnet:languages @=@ dnet:languages @=@ elx @=@ Elamite +dnet:languages @=@ dnet:languages @=@ eng @=@ English +dnet:languages @=@ dnet:languages @=@ cpe @=@ English-based Creoles and Pidgins +dnet:languages @=@ dnet:languages @=@ esk @=@ Eskimo +dnet:languages @=@ dnet:languages @=@ epo @=@ Esperanto +dnet:languages @=@ dnet:languages @=@ est @=@ Estonian +dnet:languages @=@ dnet:languages @=@ ewe @=@ Ewe +dnet:languages @=@ dnet:languages @=@ ewo @=@ Ewondo +dnet:languages @=@ dnet:languages @=@ fan @=@ Fang +dnet:languages @=@ dnet:languages @=@ fat @=@ Fanti +dnet:languages @=@ dnet:languages @=@ fao @=@ Faroese +dnet:languages @=@ dnet:languages @=@ fij @=@ Fijian +dnet:languages @=@ dnet:languages @=@ fin @=@ Finnish +dnet:languages @=@ dnet:languages @=@ fiu @=@ Finno-Ugrian +dnet:languages @=@ dnet:languages @=@ fon @=@ Fon +dnet:languages @=@ dnet:languages @=@ fra/fre @=@ French +dnet:languages @=@ dnet:languages @=@ cpf @=@ French-based Creoles and Pidgins +dnet:languages @=@ dnet:languages @=@ fry @=@ Frisian +dnet:languages @=@ dnet:languages @=@ ful @=@ Fulah +dnet:languages @=@ dnet:languages @=@ gaa @=@ Ga +dnet:languages @=@ dnet:languages @=@ gae/gdh @=@ Gaelic +dnet:languages @=@ dnet:languages @=@ gla @=@ Gaelic; Scottish Gaelic +dnet:languages @=@ dnet:languages @=@ glg @=@ Galician +dnet:languages @=@ dnet:languages @=@ lug @=@ Ganda +dnet:languages @=@ dnet:languages @=@ gay @=@ Gayo +dnet:languages @=@ dnet:languages @=@ gez @=@ Geez +dnet:languages @=@ dnet:languages @=@ geo/kat @=@ Georgian +dnet:languages @=@ dnet:languages @=@ deu/ger @=@ German +dnet:languages @=@ dnet:languages @=@ gem @=@ Germanic +dnet:languages @=@ dnet:languages @=@ kik @=@ Gikuyu; Kikuyu +dnet:languages @=@ dnet:languages @=@ gil @=@ Gilbertese +dnet:languages @=@ dnet:languages @=@ gon @=@ Gondi +dnet:languages @=@ dnet:languages @=@ got @=@ Gothic +dnet:languages @=@ dnet:languages @=@ grb @=@ Grebo +dnet:languages @=@ dnet:languages @=@ ell/gre @=@ Greek +dnet:languages @=@ dnet:languages @=@ gre/ell @=@ Greek, Modern (1453-) +dnet:languages @=@ dnet:languages @=@ kal @=@ Greenlandic; Kalaallisut +dnet:languages @=@ dnet:languages @=@ grn @=@ Guarani +dnet:languages @=@ dnet:languages @=@ guj @=@ Gujarati +dnet:languages @=@ dnet:languages @=@ hai @=@ Haida +dnet:languages @=@ dnet:languages @=@ hat @=@ Haitian; Haitian Creole +dnet:languages @=@ dnet:languages @=@ hau @=@ Hausa +dnet:languages @=@ dnet:languages @=@ haw @=@ Hawaiian +dnet:languages @=@ dnet:languages @=@ heb @=@ Hebrew +dnet:languages @=@ dnet:languages @=@ her @=@ Herero +dnet:languages @=@ dnet:languages @=@ hil @=@ Hiligaynon +dnet:languages @=@ dnet:languages @=@ him @=@ Himachali +dnet:languages @=@ dnet:languages @=@ hin @=@ Hindi +dnet:languages @=@ dnet:languages @=@ hmo @=@ Hiri Motu +dnet:languages @=@ dnet:languages @=@ hun @=@ Hungarian +dnet:languages @=@ dnet:languages @=@ hup @=@ Hupa 
+dnet:languages @=@ dnet:languages @=@ iba @=@ Iban +dnet:languages @=@ dnet:languages @=@ ice/isl @=@ Icelandic +dnet:languages @=@ dnet:languages @=@ ido @=@ Ido +dnet:languages @=@ dnet:languages @=@ ibo @=@ Igbo +dnet:languages @=@ dnet:languages @=@ ijo @=@ Ijo +dnet:languages @=@ dnet:languages @=@ ilo @=@ Iloko +dnet:languages @=@ dnet:languages @=@ inc @=@ Indic +dnet:languages @=@ dnet:languages @=@ ine @=@ Indo-European +dnet:languages @=@ dnet:languages @=@ ind @=@ Indonesian +dnet:languages @=@ dnet:languages @=@ ile @=@ Interlingue +dnet:languages @=@ dnet:languages @=@ iku @=@ Inuktitut +dnet:languages @=@ dnet:languages @=@ ipk @=@ Inupiaq +dnet:languages @=@ dnet:languages @=@ ira @=@ Iranian +dnet:languages @=@ dnet:languages @=@ gai/iri @=@ Irish +dnet:languages @=@ dnet:languages @=@ iro @=@ Iroquoian +dnet:languages @=@ dnet:languages @=@ ita @=@ Italian +dnet:languages @=@ dnet:languages @=@ jpn @=@ Japanese +dnet:languages @=@ dnet:languages @=@ jav @=@ Javanese +dnet:languages @=@ dnet:languages @=@ jrb @=@ Judeo-Arabic +dnet:languages @=@ dnet:languages @=@ jpr @=@ Judeo-Persian +dnet:languages @=@ dnet:languages @=@ kab @=@ Kabyle +dnet:languages @=@ dnet:languages @=@ kac @=@ Kachin +dnet:languages @=@ dnet:languages @=@ kam @=@ Kamba +dnet:languages @=@ dnet:languages @=@ kan @=@ Kannada +dnet:languages @=@ dnet:languages @=@ kau @=@ Kanuri +dnet:languages @=@ dnet:languages @=@ kaa @=@ Kara-Kalpak +dnet:languages @=@ dnet:languages @=@ kar @=@ Karen +dnet:languages @=@ dnet:languages @=@ kas @=@ Kashmiri +dnet:languages @=@ dnet:languages @=@ kaw @=@ Kawi +dnet:languages @=@ dnet:languages @=@ kaz @=@ Kazakh +dnet:languages @=@ dnet:languages @=@ kha @=@ Khasi +dnet:languages @=@ dnet:languages @=@ khm @=@ Khmer +dnet:languages @=@ dnet:languages @=@ khi @=@ Khoisan +dnet:languages @=@ dnet:languages @=@ kho @=@ Khotanese +dnet:languages @=@ dnet:languages @=@ kin @=@ Kinyarwanda +dnet:languages @=@ dnet:languages @=@ kir @=@ Kirghiz +dnet:languages @=@ dnet:languages @=@ kom @=@ Komi +dnet:languages @=@ dnet:languages @=@ kon @=@ Kongo +dnet:languages @=@ dnet:languages @=@ kok @=@ Konkani +dnet:languages @=@ dnet:languages @=@ kor @=@ Korean +dnet:languages @=@ dnet:languages @=@ kpe @=@ Kpelle +dnet:languages @=@ dnet:languages @=@ kro @=@ Kru +dnet:languages @=@ dnet:languages @=@ kua @=@ Kuanyama; Kwanyama +dnet:languages @=@ dnet:languages @=@ kum @=@ Kumyk +dnet:languages @=@ dnet:languages @=@ kur @=@ Kurdish +dnet:languages @=@ dnet:languages @=@ kru @=@ Kurukh +dnet:languages @=@ dnet:languages @=@ kus @=@ Kusaie +dnet:languages @=@ dnet:languages @=@ kut @=@ Kutenai +dnet:languages @=@ dnet:languages @=@ lad @=@ Ladino +dnet:languages @=@ dnet:languages @=@ lah @=@ Lahnda +dnet:languages @=@ dnet:languages @=@ lam @=@ Lamba +dnet:languages @=@ dnet:languages @=@ lao @=@ Lao +dnet:languages @=@ dnet:languages @=@ lat @=@ Latin +dnet:languages @=@ dnet:languages @=@ lav @=@ Latvian +dnet:languages @=@ dnet:languages @=@ ltz @=@ Letzeburgesch; Luxembourgish +dnet:languages @=@ dnet:languages @=@ lez @=@ Lezghian +dnet:languages @=@ dnet:languages @=@ lim @=@ Limburgan; Limburger; Limburgish +dnet:languages @=@ dnet:languages @=@ lin @=@ Lingala +dnet:languages @=@ dnet:languages @=@ lit @=@ Lithuanian +dnet:languages @=@ dnet:languages @=@ loz @=@ Lozi +dnet:languages @=@ dnet:languages @=@ lub @=@ Luba-Katanga +dnet:languages @=@ dnet:languages @=@ lui @=@ Luiseno +dnet:languages @=@ dnet:languages @=@ lun @=@ Lunda +dnet:languages @=@ dnet:languages 
@=@ luo @=@ Luo +dnet:languages @=@ dnet:languages @=@ mac/mak @=@ Macedonian +dnet:languages @=@ dnet:languages @=@ mad @=@ Madurese +dnet:languages @=@ dnet:languages @=@ mag @=@ Magahi +dnet:languages @=@ dnet:languages @=@ mai @=@ Maithili +dnet:languages @=@ dnet:languages @=@ mak @=@ Makasar +dnet:languages @=@ dnet:languages @=@ mlg @=@ Malagasy +dnet:languages @=@ dnet:languages @=@ may/msa @=@ Malay +dnet:languages @=@ dnet:languages @=@ mal @=@ Malayalam +dnet:languages @=@ dnet:languages @=@ mlt @=@ Maltese +dnet:languages @=@ dnet:languages @=@ man @=@ Mandingo +dnet:languages @=@ dnet:languages @=@ mni @=@ Manipuri +dnet:languages @=@ dnet:languages @=@ mno @=@ Manobo +dnet:languages @=@ dnet:languages @=@ glv @=@ Manx +dnet:languages @=@ dnet:languages @=@ mao/mri @=@ Maori +dnet:languages @=@ dnet:languages @=@ mar @=@ Marathi +dnet:languages @=@ dnet:languages @=@ chm @=@ Mari +dnet:languages @=@ dnet:languages @=@ mah @=@ Marshallese +dnet:languages @=@ dnet:languages @=@ mwr @=@ Marwari +dnet:languages @=@ dnet:languages @=@ mas @=@ Masai +dnet:languages @=@ dnet:languages @=@ myn @=@ Mayan +dnet:languages @=@ dnet:languages @=@ men @=@ Mende +dnet:languages @=@ dnet:languages @=@ mic @=@ Micmac +dnet:languages @=@ dnet:languages @=@ dum @=@ Middle Dutch +dnet:languages @=@ dnet:languages @=@ enm @=@ Middle English +dnet:languages @=@ dnet:languages @=@ frm @=@ Middle French +dnet:languages @=@ dnet:languages @=@ gmh @=@ Middle High German +dnet:languages @=@ dnet:languages @=@ mga @=@ Middle Irish +dnet:languages @=@ dnet:languages @=@ min @=@ Minangkabau +dnet:languages @=@ dnet:languages @=@ mis @=@ Miscellaneous +dnet:languages @=@ dnet:languages @=@ moh @=@ Mohawk +dnet:languages @=@ dnet:languages @=@ mol @=@ Moldavian +dnet:languages @=@ dnet:languages @=@ mkh @=@ Mon-Khmer +dnet:languages @=@ dnet:languages @=@ lol @=@ Mongo +dnet:languages @=@ dnet:languages @=@ mon @=@ Mongolian +dnet:languages @=@ dnet:languages @=@ mos @=@ Mossi +dnet:languages @=@ dnet:languages @=@ mul @=@ Multiple languages +dnet:languages @=@ dnet:languages @=@ mun @=@ Munda +dnet:languages @=@ dnet:languages @=@ nau @=@ Nauru +dnet:languages @=@ dnet:languages @=@ nav @=@ Navajo; Navaho +dnet:languages @=@ dnet:languages @=@ nde @=@ Ndebele, North +dnet:languages @=@ dnet:languages @=@ nbl @=@ Ndebele, South +dnet:languages @=@ dnet:languages @=@ ndo @=@ Ndonga +dnet:languages @=@ dnet:languages @=@ nep @=@ Nepali +dnet:languages @=@ dnet:languages @=@ new @=@ Newari +dnet:languages @=@ dnet:languages @=@ nic @=@ Niger-Kordofanian +dnet:languages @=@ dnet:languages @=@ ssa @=@ Nilo-Saharan +dnet:languages @=@ dnet:languages @=@ niu @=@ Niuean +dnet:languages @=@ dnet:languages @=@ non @=@ Norse +dnet:languages @=@ dnet:languages @=@ nai @=@ North American Indian +dnet:languages @=@ dnet:languages @=@ sme @=@ Northern Sami +dnet:languages @=@ dnet:languages @=@ nor @=@ Norwegian +dnet:languages @=@ dnet:languages @=@ nno @=@ Norwegian Nynorsk; Nynorsk, Norwegian +dnet:languages @=@ dnet:languages @=@ nub @=@ Nubian +dnet:languages @=@ dnet:languages @=@ nym @=@ Nyamwezi +dnet:languages @=@ dnet:languages @=@ nyn @=@ Nyankole +dnet:languages @=@ dnet:languages @=@ nyo @=@ Nyoro +dnet:languages @=@ dnet:languages @=@ nzi @=@ Nzima +dnet:languages @=@ dnet:languages @=@ oci @=@ Occitan (post 1500); Provençal +dnet:languages @=@ dnet:languages @=@ oji @=@ Ojibwa +dnet:languages @=@ dnet:languages @=@ ang @=@ Old English +dnet:languages @=@ dnet:languages @=@ fro @=@ Old French +dnet:languages
@=@ dnet:languages @=@ goh @=@ Old High German +dnet:languages @=@ dnet:languages @=@ ori @=@ Oriya +dnet:languages @=@ dnet:languages @=@ orm @=@ Oromo +dnet:languages @=@ dnet:languages @=@ osa @=@ Osage +dnet:languages @=@ dnet:languages @=@ oss @=@ Ossetian; Ossetic +dnet:languages @=@ dnet:languages @=@ oto @=@ Otomian +dnet:languages @=@ dnet:languages @=@ ota @=@ Ottoman +dnet:languages @=@ dnet:languages @=@ pal @=@ Pahlavi +dnet:languages @=@ dnet:languages @=@ pau @=@ Palauan +dnet:languages @=@ dnet:languages @=@ pli @=@ Pali +dnet:languages @=@ dnet:languages @=@ pam @=@ Pampanga +dnet:languages @=@ dnet:languages @=@ pag @=@ Pangasinan +dnet:languages @=@ dnet:languages @=@ pan @=@ Panjabi; Punjabi +dnet:languages @=@ dnet:languages @=@ pap @=@ Papiamento +dnet:languages @=@ dnet:languages @=@ paa @=@ Papuan-Australian +dnet:languages @=@ dnet:languages @=@ fas/per @=@ Persian +dnet:languages @=@ dnet:languages @=@ peo @=@ Persian, Old (ca 600 - 400 B.C.) +dnet:languages @=@ dnet:languages @=@ phn @=@ Phoenician +dnet:languages @=@ dnet:languages @=@ pol @=@ Polish +dnet:languages @=@ dnet:languages @=@ pon @=@ Ponape +dnet:languages @=@ dnet:languages @=@ por @=@ Portuguese +dnet:languages @=@ dnet:languages @=@ cpp @=@ Portuguese-based Creoles and Pidgins +dnet:languages @=@ dnet:languages @=@ pra @=@ Prakrit +dnet:languages @=@ dnet:languages @=@ pro @=@ Provencal +dnet:languages @=@ dnet:languages @=@ pus @=@ Pushto +dnet:languages @=@ dnet:languages @=@ que @=@ Quechua +dnet:languages @=@ dnet:languages @=@ roh @=@ Raeto-Romance +dnet:languages @=@ dnet:languages @=@ raj @=@ Rajasthani +dnet:languages @=@ dnet:languages @=@ rar @=@ Rarotongan +dnet:languages @=@ dnet:languages @=@ roa @=@ Romance +dnet:languages @=@ dnet:languages @=@ ron/rum @=@ Romanian +dnet:languages @=@ dnet:languages @=@ rom @=@ Romany +dnet:languages @=@ dnet:languages @=@ run @=@ Rundi +dnet:languages @=@ dnet:languages @=@ rus @=@ Russian +dnet:languages @=@ dnet:languages @=@ sal @=@ Salishan +dnet:languages @=@ dnet:languages @=@ sam @=@ Samaritan +dnet:languages @=@ dnet:languages @=@ smi @=@ Sami +dnet:languages @=@ dnet:languages @=@ smo @=@ Samoan +dnet:languages @=@ dnet:languages @=@ sad @=@ Sandawe +dnet:languages @=@ dnet:languages @=@ sag @=@ Sango +dnet:languages @=@ dnet:languages @=@ san @=@ Sanskrit +dnet:languages @=@ dnet:languages @=@ srd @=@ Sardinian +dnet:languages @=@ dnet:languages @=@ sco @=@ Scots +dnet:languages @=@ dnet:languages @=@ sel @=@ Selkup +dnet:languages @=@ dnet:languages @=@ sem @=@ Semitic +dnet:languages @=@ dnet:languages @=@ srp @=@ Serbian +dnet:languages @=@ dnet:languages @=@ scr @=@ Serbo-Croatian +dnet:languages @=@ dnet:languages @=@ srr @=@ Serer +dnet:languages @=@ dnet:languages @=@ shn @=@ Shan +dnet:languages @=@ dnet:languages @=@ sna @=@ Shona +dnet:languages @=@ dnet:languages @=@ iii @=@ Sichuan Yi +dnet:languages @=@ dnet:languages @=@ sid @=@ Sidamo +dnet:languages @=@ dnet:languages @=@ bla @=@ Siksika +dnet:languages @=@ dnet:languages @=@ snd @=@ Sindhi +dnet:languages @=@ dnet:languages @=@ sin @=@ Sinhala; Sinhalese +dnet:languages @=@ dnet:languages @=@ sit @=@ Sino-Tibetan +dnet:languages @=@ dnet:languages @=@ sio @=@ Siouan +dnet:languages @=@ dnet:languages @=@ sla @=@ Slavic +dnet:languages @=@ dnet:languages @=@ slk/slo @=@ Slovak +dnet:languages @=@ dnet:languages @=@ slv @=@ Slovenian +dnet:languages @=@ dnet:languages @=@ sog @=@ Sogdian +dnet:languages @=@ dnet:languages @=@ som @=@ Somali +dnet:languages @=@ 
dnet:languages @=@ son @=@ Songhai +dnet:languages @=@ dnet:languages @=@ wen @=@ Sorbian +dnet:languages @=@ dnet:languages @=@ nso @=@ Sotho +dnet:languages @=@ dnet:languages @=@ sot @=@ Sotho, Southern +dnet:languages @=@ dnet:languages @=@ sai @=@ South American Indian +dnet:languages @=@ dnet:languages @=@ esl/spa @=@ Spanish +dnet:languages @=@ dnet:languages @=@ spa @=@ Spanish; Castilian +dnet:languages @=@ dnet:languages @=@ suk @=@ Sukuma +dnet:languages @=@ dnet:languages @=@ sux @=@ Sumerian +dnet:languages @=@ dnet:languages @=@ sun @=@ Sundanese +dnet:languages @=@ dnet:languages @=@ sus @=@ Susu +dnet:languages @=@ dnet:languages @=@ swa @=@ Swahili +dnet:languages @=@ dnet:languages @=@ ssw @=@ Swati +dnet:languages @=@ dnet:languages @=@ swe @=@ Swedish +dnet:languages @=@ dnet:languages @=@ syr @=@ Syriac +dnet:languages @=@ dnet:languages @=@ tgl @=@ Tagalog +dnet:languages @=@ dnet:languages @=@ tah @=@ Tahitian +dnet:languages @=@ dnet:languages @=@ tgk @=@ Tajik +dnet:languages @=@ dnet:languages @=@ tmh @=@ Tamashek +dnet:languages @=@ dnet:languages @=@ tam @=@ Tamil +dnet:languages @=@ dnet:languages @=@ tat @=@ Tatar +dnet:languages @=@ dnet:languages @=@ tel @=@ Telugu +dnet:languages @=@ dnet:languages @=@ ter @=@ Tereno +dnet:languages @=@ dnet:languages @=@ tha @=@ Thai +dnet:languages @=@ dnet:languages @=@ bod/tib @=@ Tibetan +dnet:languages @=@ dnet:languages @=@ tig @=@ Tigre +dnet:languages @=@ dnet:languages @=@ tir @=@ Tigrinya +dnet:languages @=@ dnet:languages @=@ tem @=@ Timne +dnet:languages @=@ dnet:languages @=@ tiv @=@ Tivi +dnet:languages @=@ dnet:languages @=@ tli @=@ Tlingit +dnet:languages @=@ dnet:languages @=@ ton @=@ Tonga (Tonga Islands) +dnet:languages @=@ dnet:languages @=@ tog @=@ Tonga(Nyasa) +dnet:languages @=@ dnet:languages @=@ tru @=@ Truk +dnet:languages @=@ dnet:languages @=@ tsi @=@ Tsimshian +dnet:languages @=@ dnet:languages @=@ tso @=@ Tsonga +dnet:languages @=@ dnet:languages @=@ tsn @=@ Tswana +dnet:languages @=@ dnet:languages @=@ tum @=@ Tumbuka +dnet:languages @=@ dnet:languages @=@ tur @=@ Turkish +dnet:languages @=@ dnet:languages @=@ tuk @=@ Turkmen +dnet:languages @=@ dnet:languages @=@ tyv @=@ Tuvinian +dnet:languages @=@ dnet:languages @=@ twi @=@ Twi +dnet:languages @=@ dnet:languages @=@ uga @=@ Ugaritic +dnet:languages @=@ dnet:languages @=@ uig @=@ Uighur; Uyghur +dnet:languages @=@ dnet:languages @=@ ukr @=@ Ukrainian +dnet:languages @=@ dnet:languages @=@ umb @=@ Umbundu +dnet:languages @=@ dnet:languages @=@ und @=@ Undetermined +dnet:languages @=@ dnet:languages @=@ urd @=@ Urdu +dnet:languages @=@ dnet:languages @=@ uzb @=@ Uzbek +dnet:languages @=@ dnet:languages @=@ vai @=@ Vai +dnet:languages @=@ dnet:languages @=@ ven @=@ Venda +dnet:languages @=@ dnet:languages @=@ vie @=@ Vietnamese +dnet:languages @=@ dnet:languages @=@ vol @=@ Volapük +dnet:languages @=@ dnet:languages @=@ vot @=@ Votic +dnet:languages @=@ dnet:languages @=@ wak @=@ Wakashan +dnet:languages @=@ dnet:languages @=@ wal @=@ Walamo +dnet:languages @=@ dnet:languages @=@ wln @=@ Walloon +dnet:languages @=@ dnet:languages @=@ war @=@ Waray +dnet:languages @=@ dnet:languages @=@ was @=@ Washo +dnet:languages @=@ dnet:languages @=@ cym/wel @=@ Welsh +dnet:languages @=@ dnet:languages @=@ wol @=@ Wolof +dnet:languages @=@ dnet:languages @=@ xho @=@ Xhosa +dnet:languages @=@ dnet:languages @=@ sah @=@ Yakut +dnet:languages @=@ dnet:languages @=@ yao @=@ Yao +dnet:languages @=@ dnet:languages @=@ yap @=@ Yap +dnet:languages @=@ 
dnet:languages @=@ yid @=@ Yiddish +dnet:languages @=@ dnet:languages @=@ yor @=@ Yoruba +dnet:languages @=@ dnet:languages @=@ zap @=@ Zapotec +dnet:languages @=@ dnet:languages @=@ zen @=@ Zenaga +dnet:languages @=@ dnet:languages @=@ zha @=@ Zhuang; Chuang +dnet:languages @=@ dnet:languages @=@ zul @=@ Zulu +dnet:languages @=@ dnet:languages @=@ zun @=@ Zuni +dnet:languages @=@ dnet:languages @=@ sga @=@ old Irish +nsf:contractTypes @=@ NSF Contract Types @=@ BOA/Task Order @=@ BOA/Task Order +nsf:contractTypes @=@ NSF Contract Types @=@ Continuing grant @=@ Continuing grant +nsf:contractTypes @=@ NSF Contract Types @=@ Contract @=@ Contract +nsf:contractTypes @=@ NSF Contract Types @=@ Contract Interagency Agreement @=@ Contract Interagency Agreement +nsf:contractTypes @=@ NSF Contract Types @=@ Cooperative Agreement @=@ Cooperative Agreement +nsf:contractTypes @=@ NSF Contract Types @=@ Fellowship @=@ Fellowship +nsf:contractTypes @=@ NSF Contract Types @=@ Fixed Price Award @=@ Fixed Price Award +nsf:contractTypes @=@ NSF Contract Types @=@ GAA @=@ GAA +nsf:contractTypes @=@ NSF Contract Types @=@ Interagency Agreement @=@ Interagency Agreement +nsf:contractTypes @=@ NSF Contract Types @=@ Intergovernmental Personnel Award @=@ Intergovernmental Personnel Award +nsf:contractTypes @=@ NSF Contract Types @=@ Personnel Agreement @=@ Personnel Agreement +nsf:contractTypes @=@ NSF Contract Types @=@ Standard Grant @=@ Standard Grant +ec:funding_relations @=@ ec:funding_relations @=@ ec:hasframeworkprogram @=@ hasframeworkprogram +ec:funding_relations @=@ ec:funding_relations @=@ ec:hasprogram @=@ hasprogram +ec:funding_relations @=@ ec:funding_relations @=@ ec:hasspecificprogram @=@ hasspecificprogram +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ UNKNOWN @=@ UNKNOWN +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ collection @=@ collection +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ dataset @=@ dataset +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ event @=@ event +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ film @=@ film +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ image @=@ image +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ interactiveResource @=@ interactiveResource +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ model @=@ model +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ physicalObject @=@ physicalObject +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ service @=@ service +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ software @=@ software +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ sound @=@ sound +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ text @=@ text +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ clinicalTrial @=@ Clinical trial +dnet:dataCite_title @=@ dnet:dataCite_title @=@ alternative title @=@ alternative title +dnet:dataCite_title @=@ dnet:dataCite_title @=@ main title @=@ main title +dnet:dataCite_title @=@ dnet:dataCite_title @=@ subtitle @=@ subtitle +dnet:dataCite_title @=@ dnet:dataCite_title @=@ translated title @=@ translated title +datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsCitedBy @=@ IsCitedBy +datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsNewVersionOf @=@ IsNewVersionOf +datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsPartOf @=@ IsPartOf +datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsPreviousVersionOf @=@ IsPreviousVersionOf 
+datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsReferencedBy @=@ IsReferencedBy +datacite:relation_typologies @=@ datacite:relation_typologies @=@ References @=@ References +datacite:relation_typologies @=@ datacite:relation_typologies @=@ UNKNOWN @=@ UNKNOWN +dnet:result_typologies @=@ dnet:result_typologies @=@ dataset @=@ dataset +dnet:result_typologies @=@ dnet:result_typologies @=@ other @=@ other +dnet:result_typologies @=@ dnet:result_typologies @=@ publication @=@ publication +dnet:result_typologies @=@ dnet:result_typologies @=@ software @=@ software +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-ADG @=@ Advanced Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ BBI-CSA @=@ Bio-based Industries Coordination and Support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ BBI-IA-DEMO @=@ Bio-based Industries Innovation action - Demonstration +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ BBI-IA-FLAG @=@ Bio-based Industries Innovation action - Flagship +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ BBI-RIA @=@ Bio-based Industries Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-EF-CAR @=@ CAR – Career Restart panel +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ COFUND-EJP @=@ COFUND (European Joint Programme) +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ COFUND-PCP @=@ COFUND (PCP) +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ COFUND-PPI @=@ COFUND (PPI) +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CS2-CSA @=@ CS2 Coordination and Support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CS2-IA @=@ CS2 Innovation Action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CS2-RIA @=@ CS2 Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CSA-LS @=@ CSA Lump sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-COG @=@ Consolidator Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ FCH2-CSA @=@ Coordination & support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CSA @=@ Coordination and support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-COFUND-DP @=@ Doctoral programmes +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ECSEL-CSA @=@ ECSEL Coordination & Support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ECSEL-IA @=@ ECSEL Innovation Action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ECSEL-RIA @=@ ECSEL Research and Innovation Actions +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERA-NET-Cofund @=@ ERA-NET Cofund +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-POC-LS @=@ ERC Proof of Concept Lump Sum Pilot +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-SyG @=@ ERC Synergy Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-LVG @=@ ERC low value grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ H2020-EEN-SGA @=@ Enterprise Europe Network - Specific Grant Agreement +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-ITN-EID @=@ European Industrial Doctorates +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-ITN-EJD @=@ European Joint Doctorates +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-ITN-ETN @=@ European Training Networks +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ FCH2-IA @=@ FCH2 Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ FCH2-RIA @=@ FCH2 Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ 
MSCA-COFUND-FP @=@ Fellowship programmes +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-GF @=@ Global Fellowships +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ IMI2-CSA @=@ IMI2 Coordination & support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ IMI2-RIA @=@ IMI2 Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-IA-LS @=@ Innovation Action Lump-Sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ IA-LS @=@ Innovation Action Lump-Sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ IA @=@ Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-IA @=@ Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ PCP @=@ Pre-Commercial Procurement +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-POC @=@ Proof of Concept Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ PPI @=@ Public Procurement of Innovative Solutions +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-EF-RI @=@ RI – Reintegration panel +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-RISE @=@ RISE +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-RIA-LS @=@ Research and Innovation Action Lump-Sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-RIA @=@ Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ RIA @=@ Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ RIA-LS @=@ Research and Innovation action Lump Sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SESAR-CSA @=@ SESAR: Coordination and Support Action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SESAR-IA @=@ SESAR: Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SESAR-RIA @=@ SESAR: Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SGA-RIA @=@ SGA Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SME-2b @=@ SME Instrument (grant only and blended finance) +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SME-1 @=@ SME instrument phase 1 +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SME-2 @=@ SME instrument phase 2 +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-CSA @=@ Shift2Rail - Coordination and Support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-EF-SE @=@ Society and Enterprise panel +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SGA-CSA @=@ Specific Grant agreement and Coordination and Support Action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-EF-ST @=@ Standard EF +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-STG @=@ Starting Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-SNLS @=@ Grant to identified beneficiary - Coordination and support actions (MSCA-Special Needs lump sum) +wt:contractTypes @=@ wt:contractTypes @=@ UNKNOWN @=@ UNKNOWN +dnet:countries @=@ dnet:countries @=@ AF @=@ Afghanistan +dnet:countries @=@ dnet:countries @=@ AL @=@ Albania +dnet:countries @=@ dnet:countries @=@ DZ @=@ Algeria +dnet:countries @=@ dnet:countries @=@ AS @=@ American Samoa +dnet:countries @=@ dnet:countries @=@ AD @=@ Andorra +dnet:countries @=@ dnet:countries @=@ AO @=@ Angola +dnet:countries @=@ dnet:countries @=@ AI @=@ Anguilla +dnet:countries @=@ dnet:countries @=@ AQ @=@ Antarctica +dnet:countries @=@ dnet:countries @=@ AG @=@ Antigua and Barbuda +dnet:countries @=@ dnet:countries @=@ AR @=@ Argentina +dnet:countries 
@=@ dnet:countries @=@ AM @=@ Armenia +dnet:countries @=@ dnet:countries @=@ AW @=@ Aruba +dnet:countries @=@ dnet:countries @=@ AU @=@ Australia +dnet:countries @=@ dnet:countries @=@ AT @=@ Austria +dnet:countries @=@ dnet:countries @=@ AZ @=@ Azerbaijan +dnet:countries @=@ dnet:countries @=@ BS @=@ Bahamas +dnet:countries @=@ dnet:countries @=@ BH @=@ Bahrain +dnet:countries @=@ dnet:countries @=@ BD @=@ Bangladesh +dnet:countries @=@ dnet:countries @=@ BB @=@ Barbados +dnet:countries @=@ dnet:countries @=@ BY @=@ Belarus +dnet:countries @=@ dnet:countries @=@ BE @=@ Belgium +dnet:countries @=@ dnet:countries @=@ BZ @=@ Belize +dnet:countries @=@ dnet:countries @=@ BJ @=@ Benin +dnet:countries @=@ dnet:countries @=@ BM @=@ Bermuda +dnet:countries @=@ dnet:countries @=@ BT @=@ Bhutan +dnet:countries @=@ dnet:countries @=@ BO @=@ Bolivia +dnet:countries @=@ dnet:countries @=@ BQ @=@ Bonaire, Sint Eustatius and Saba +dnet:countries @=@ dnet:countries @=@ BA @=@ Bosnia and Herzegovina +dnet:countries @=@ dnet:countries @=@ BW @=@ Botswana +dnet:countries @=@ dnet:countries @=@ BV @=@ Bouvet Island +dnet:countries @=@ dnet:countries @=@ BR @=@ Brazil +dnet:countries @=@ dnet:countries @=@ IO @=@ British Indian Ocean Territory +dnet:countries @=@ dnet:countries @=@ BN @=@ Brunei Darussalam +dnet:countries @=@ dnet:countries @=@ BG @=@ Bulgaria +dnet:countries @=@ dnet:countries @=@ BF @=@ Burkina Faso +dnet:countries @=@ dnet:countries @=@ BI @=@ Burundi +dnet:countries @=@ dnet:countries @=@ KH @=@ Cambodia +dnet:countries @=@ dnet:countries @=@ CM @=@ Cameroon +dnet:countries @=@ dnet:countries @=@ CA @=@ Canada +dnet:countries @=@ dnet:countries @=@ CV @=@ Cape Verde +dnet:countries @=@ dnet:countries @=@ KY @=@ Cayman Islands +dnet:countries @=@ dnet:countries @=@ CF @=@ Central African Republic +dnet:countries @=@ dnet:countries @=@ TD @=@ Chad +dnet:countries @=@ dnet:countries @=@ CL @=@ Chile +dnet:countries @=@ dnet:countries @=@ CN @=@ China (People's Republic of) +dnet:countries @=@ dnet:countries @=@ CX @=@ Christmas Island +dnet:countries @=@ dnet:countries @=@ CC @=@ Cocos (Keeling) Islands +dnet:countries @=@ dnet:countries @=@ CO @=@ Colombia +dnet:countries @=@ dnet:countries @=@ KM @=@ Comoros +dnet:countries @=@ dnet:countries @=@ CG @=@ Congo +dnet:countries @=@ dnet:countries @=@ CD @=@ Congo (Democratic Republic of) +dnet:countries @=@ dnet:countries @=@ CK @=@ Cook Islands +dnet:countries @=@ dnet:countries @=@ CR @=@ Costa Rica +dnet:countries @=@ dnet:countries @=@ CI @=@ Cote d'Ivoire +dnet:countries @=@ dnet:countries @=@ HR @=@ Croatia +dnet:countries @=@ dnet:countries @=@ CU @=@ Cuba +dnet:countries @=@ dnet:countries @=@ CW @=@ Curaçao +dnet:countries @=@ dnet:countries @=@ CY @=@ Cyprus +dnet:countries @=@ dnet:countries @=@ CZ @=@ Czech Republic +dnet:countries @=@ dnet:countries @=@ DK @=@ Denmark +dnet:countries @=@ dnet:countries @=@ DJ @=@ Djibouti +dnet:countries @=@ dnet:countries @=@ DM @=@ Dominica +dnet:countries @=@ dnet:countries @=@ DO @=@ Dominican Republic +dnet:countries @=@ dnet:countries @=@ EC @=@ Ecuador +dnet:countries @=@ dnet:countries @=@ EG @=@ Egypt +dnet:countries @=@ dnet:countries @=@ SV @=@ El Salvador +dnet:countries @=@ dnet:countries @=@ GQ @=@ Equatorial Guinea +dnet:countries @=@ dnet:countries @=@ ER @=@ Eritrea +dnet:countries @=@ dnet:countries @=@ EE @=@ Estonia +dnet:countries @=@ dnet:countries @=@ ET @=@ Ethiopia +dnet:countries @=@ dnet:countries @=@ EU @=@ European Union +dnet:countries @=@ dnet:countries @=@ FK @=@ 
Falkland Islands (Malvinas) +dnet:countries @=@ dnet:countries @=@ FO @=@ Faroe Islands +dnet:countries @=@ dnet:countries @=@ FJ @=@ Fiji +dnet:countries @=@ dnet:countries @=@ FI @=@ Finland +dnet:countries @=@ dnet:countries @=@ MK @=@ Former Yugoslav Republic of Macedonia +dnet:countries @=@ dnet:countries @=@ FR @=@ France +dnet:countries @=@ dnet:countries @=@ GF @=@ French Guiana +dnet:countries @=@ dnet:countries @=@ PF @=@ French Polynesia +dnet:countries @=@ dnet:countries @=@ TF @=@ French Southern Territories +dnet:countries @=@ dnet:countries @=@ GA @=@ Gabon +dnet:countries @=@ dnet:countries @=@ GM @=@ Gambia +dnet:countries @=@ dnet:countries @=@ GE @=@ Georgia +dnet:countries @=@ dnet:countries @=@ DE @=@ Germany +dnet:countries @=@ dnet:countries @=@ GH @=@ Ghana +dnet:countries @=@ dnet:countries @=@ GI @=@ Gibraltar +dnet:countries @=@ dnet:countries @=@ GR @=@ Greece +dnet:countries @=@ dnet:countries @=@ GL @=@ Greenland +dnet:countries @=@ dnet:countries @=@ GD @=@ Grenada +dnet:countries @=@ dnet:countries @=@ GP @=@ Guadeloupe +dnet:countries @=@ dnet:countries @=@ GU @=@ Guam +dnet:countries @=@ dnet:countries @=@ GT @=@ Guatemala +dnet:countries @=@ dnet:countries @=@ GG @=@ Guernsey +dnet:countries @=@ dnet:countries @=@ GN @=@ Guinea +dnet:countries @=@ dnet:countries @=@ GW @=@ Guinea-Bissau +dnet:countries @=@ dnet:countries @=@ GY @=@ Guyana +dnet:countries @=@ dnet:countries @=@ HT @=@ Haiti +dnet:countries @=@ dnet:countries @=@ HM @=@ Heard Island and McDonald Islands +dnet:countries @=@ dnet:countries @=@ VA @=@ Holy See (Vatican City State) +dnet:countries @=@ dnet:countries @=@ HN @=@ Honduras +dnet:countries @=@ dnet:countries @=@ HK @=@ Hong Kong +dnet:countries @=@ dnet:countries @=@ HU @=@ Hungary +dnet:countries @=@ dnet:countries @=@ IS @=@ Iceland +dnet:countries @=@ dnet:countries @=@ IN @=@ India +dnet:countries @=@ dnet:countries @=@ ID @=@ Indonesia +dnet:countries @=@ dnet:countries @=@ IR @=@ Iran (Islamic Republic of) +dnet:countries @=@ dnet:countries @=@ IQ @=@ Iraq +dnet:countries @=@ dnet:countries @=@ IE @=@ Ireland +dnet:countries @=@ dnet:countries @=@ IM @=@ Isle of Man +dnet:countries @=@ dnet:countries @=@ IL @=@ Israel +dnet:countries @=@ dnet:countries @=@ IT @=@ Italy +dnet:countries @=@ dnet:countries @=@ JM @=@ Jamaica +dnet:countries @=@ dnet:countries @=@ JP @=@ Japan +dnet:countries @=@ dnet:countries @=@ JE @=@ Jersey +dnet:countries @=@ dnet:countries @=@ JO @=@ Jordan +dnet:countries @=@ dnet:countries @=@ KZ @=@ Kazakhstan +dnet:countries @=@ dnet:countries @=@ KE @=@ Kenya +dnet:countries @=@ dnet:countries @=@ KI @=@ Kiribati +dnet:countries @=@ dnet:countries @=@ KR @=@ Korea (Republic of) +dnet:countries @=@ dnet:countries @=@ KP @=@ Korea, Democratic People's Republic of +dnet:countries @=@ dnet:countries @=@ XK @=@ Kosovo * UN resolution +dnet:countries @=@ dnet:countries @=@ KW @=@ Kuwait +dnet:countries @=@ dnet:countries @=@ KG @=@ Kyrgyzstan +dnet:countries @=@ dnet:countries @=@ LA @=@ Lao (People's Democratic Republic) +dnet:countries @=@ dnet:countries @=@ LV @=@ Latvia +dnet:countries @=@ dnet:countries @=@ LB @=@ Lebanon +dnet:countries @=@ dnet:countries @=@ LS @=@ Lesotho +dnet:countries @=@ dnet:countries @=@ LR @=@ Liberia +dnet:countries @=@ dnet:countries @=@ LY @=@ Libyan Arab Jamahiriya +dnet:countries @=@ dnet:countries @=@ LI @=@ Liechtenstein +dnet:countries @=@ dnet:countries @=@ LT @=@ Lithuania +dnet:countries @=@ dnet:countries @=@ LU @=@ Luxembourg +dnet:countries @=@ dnet:countries @=@
MO @=@ Macao +dnet:countries @=@ dnet:countries @=@ MG @=@ Madagascar +dnet:countries @=@ dnet:countries @=@ MW @=@ Malawi +dnet:countries @=@ dnet:countries @=@ MY @=@ Malaysia +dnet:countries @=@ dnet:countries @=@ MV @=@ Maldives +dnet:countries @=@ dnet:countries @=@ ML @=@ Mali +dnet:countries @=@ dnet:countries @=@ MT @=@ Malta +dnet:countries @=@ dnet:countries @=@ MH @=@ Marshall Islands +dnet:countries @=@ dnet:countries @=@ MQ @=@ Martinique +dnet:countries @=@ dnet:countries @=@ MR @=@ Mauritania +dnet:countries @=@ dnet:countries @=@ MU @=@ Mauritius +dnet:countries @=@ dnet:countries @=@ YT @=@ Mayotte +dnet:countries @=@ dnet:countries @=@ MX @=@ Mexico +dnet:countries @=@ dnet:countries @=@ FM @=@ Micronesia, Federated States of +dnet:countries @=@ dnet:countries @=@ MD @=@ Moldova (Republic of) +dnet:countries @=@ dnet:countries @=@ MN @=@ Mongolia +dnet:countries @=@ dnet:countries @=@ ME @=@ Montenegro +dnet:countries @=@ dnet:countries @=@ MS @=@ Montserrat +dnet:countries @=@ dnet:countries @=@ MA @=@ Morocco +dnet:countries @=@ dnet:countries @=@ MZ @=@ Mozambique +dnet:countries @=@ dnet:countries @=@ MM @=@ Myanmar +dnet:countries @=@ dnet:countries @=@ NA @=@ Namibia +dnet:countries @=@ dnet:countries @=@ NR @=@ Nauru +dnet:countries @=@ dnet:countries @=@ NP @=@ Nepal +dnet:countries @=@ dnet:countries @=@ NL @=@ Netherlands +dnet:countries @=@ dnet:countries @=@ AN @=@ Netherlands Antilles +dnet:countries @=@ dnet:countries @=@ NC @=@ New Caledonia +dnet:countries @=@ dnet:countries @=@ NZ @=@ New Zealand +dnet:countries @=@ dnet:countries @=@ NI @=@ Nicaragua +dnet:countries @=@ dnet:countries @=@ NE @=@ Niger +dnet:countries @=@ dnet:countries @=@ NG @=@ Nigeria +dnet:countries @=@ dnet:countries @=@ NU @=@ Niue +dnet:countries @=@ dnet:countries @=@ NF @=@ Norfolk Island +dnet:countries @=@ dnet:countries @=@ MP @=@ Northern Mariana Islands +dnet:countries @=@ dnet:countries @=@ NO @=@ Norway +dnet:countries @=@ dnet:countries @=@ OC @=@ Oceania +dnet:countries @=@ dnet:countries @=@ OM @=@ Oman +dnet:countries @=@ dnet:countries @=@ PK @=@ Pakistan +dnet:countries @=@ dnet:countries @=@ PW @=@ Palau +dnet:countries @=@ dnet:countries @=@ PS @=@ Palestinian-administered areas +dnet:countries @=@ dnet:countries @=@ PA @=@ Panama +dnet:countries @=@ dnet:countries @=@ PG @=@ Papua New Guinea +dnet:countries @=@ dnet:countries @=@ PY @=@ Paraguay +dnet:countries @=@ dnet:countries @=@ PE @=@ Peru +dnet:countries @=@ dnet:countries @=@ PH @=@ Philippines +dnet:countries @=@ dnet:countries @=@ PN @=@ Pitcairn +dnet:countries @=@ dnet:countries @=@ PL @=@ Poland +dnet:countries @=@ dnet:countries @=@ PT @=@ Portugal +dnet:countries @=@ dnet:countries @=@ PR @=@ Puerto Rico +dnet:countries @=@ dnet:countries @=@ QA @=@ Qatar +dnet:countries @=@ dnet:countries @=@ RO @=@ Romania +dnet:countries @=@ dnet:countries @=@ RU @=@ Russian Federation +dnet:countries @=@ dnet:countries @=@ RW @=@ Rwanda +dnet:countries @=@ dnet:countries @=@ RE @=@ Réunion +dnet:countries @=@ dnet:countries @=@ SH @=@ Saint Helena, Ascension and Tristan da Cunha +dnet:countries @=@ dnet:countries @=@ KN @=@ Saint Kitts and Nevis +dnet:countries @=@ dnet:countries @=@ LC @=@ Saint Lucia +dnet:countries @=@ dnet:countries @=@ MF @=@ Saint Martin (French Part) +dnet:countries @=@ dnet:countries @=@ PM @=@ Saint Pierre and Miquelon +dnet:countries @=@ dnet:countries @=@ VC @=@ Saint Vincent and the Grenadines +dnet:countries @=@ dnet:countries @=@ BL @=@ Saint-Barthélemy +dnet:countries @=@ 
dnet:countries @=@ WS @=@ Samoa +dnet:countries @=@ dnet:countries @=@ SM @=@ San Marino +dnet:countries @=@ dnet:countries @=@ SA @=@ Saudi Arabia +dnet:countries @=@ dnet:countries @=@ SN @=@ Senegal +dnet:countries @=@ dnet:countries @=@ RS @=@ Serbia +dnet:countries @=@ dnet:countries @=@ CS @=@ Serbia and Montenegro +dnet:countries @=@ dnet:countries @=@ SC @=@ Seychelles +dnet:countries @=@ dnet:countries @=@ SL @=@ Sierra Leone +dnet:countries @=@ dnet:countries @=@ SG @=@ Singapore +dnet:countries @=@ dnet:countries @=@ SX @=@ Sint Maarten (Dutch Part) +dnet:countries @=@ dnet:countries @=@ SK @=@ Slovakia +dnet:countries @=@ dnet:countries @=@ SI @=@ Slovenia +dnet:countries @=@ dnet:countries @=@ SB @=@ Solomon Islands +dnet:countries @=@ dnet:countries @=@ SO @=@ Somalia +dnet:countries @=@ dnet:countries @=@ ZA @=@ South Africa +dnet:countries @=@ dnet:countries @=@ GS @=@ South Georgia and the South Sandwich Islands +dnet:countries @=@ dnet:countries @=@ SS @=@ South Sudan +dnet:countries @=@ dnet:countries @=@ ES @=@ Spain +dnet:countries @=@ dnet:countries @=@ LK @=@ Sri Lanka +dnet:countries @=@ dnet:countries @=@ SD @=@ Sudan +dnet:countries @=@ dnet:countries @=@ SR @=@ Suriname +dnet:countries @=@ dnet:countries @=@ SJ @=@ Svalbard and Jan Mayen +dnet:countries @=@ dnet:countries @=@ SZ @=@ Swaziland +dnet:countries @=@ dnet:countries @=@ SE @=@ Sweden +dnet:countries @=@ dnet:countries @=@ CH @=@ Switzerland +dnet:countries @=@ dnet:countries @=@ SY @=@ Syrian Arab Republic +dnet:countries @=@ dnet:countries @=@ ST @=@ São Tomé and Príncipe +dnet:countries @=@ dnet:countries @=@ TW @=@ Taiwan +dnet:countries @=@ dnet:countries @=@ TJ @=@ Tajikistan +dnet:countries @=@ dnet:countries @=@ TZ @=@ Tanzania (United Republic of) +dnet:countries @=@ dnet:countries @=@ TH @=@ Thailand +dnet:countries @=@ dnet:countries @=@ TL @=@ Timor-Leste +dnet:countries @=@ dnet:countries @=@ TG @=@ Togo +dnet:countries @=@ dnet:countries @=@ TK @=@ Tokelau +dnet:countries @=@ dnet:countries @=@ TO @=@ Tonga +dnet:countries @=@ dnet:countries @=@ TT @=@ Trinidad and Tobago +dnet:countries @=@ dnet:countries @=@ TN @=@ Tunisia +dnet:countries @=@ dnet:countries @=@ TR @=@ Turkey +dnet:countries @=@ dnet:countries @=@ TM @=@ Turkmenistan +dnet:countries @=@ dnet:countries @=@ TC @=@ Turks and Caicos Islands +dnet:countries @=@ dnet:countries @=@ TV @=@ Tuvalu +dnet:countries @=@ dnet:countries @=@ UNKNOWN @=@ UNKNOWN +dnet:countries @=@ dnet:countries @=@ UG @=@ Uganda +dnet:countries @=@ dnet:countries @=@ UA @=@ Ukraine +dnet:countries @=@ dnet:countries @=@ AE @=@ United Arab Emirates +dnet:countries @=@ dnet:countries @=@ GB @=@ United Kingdom +dnet:countries @=@ dnet:countries @=@ US @=@ United States +dnet:countries @=@ dnet:countries @=@ UM @=@ United States Minor Outlying Islands +dnet:countries @=@ dnet:countries @=@ UY @=@ Uruguay +dnet:countries @=@ dnet:countries @=@ UZ @=@ Uzbekistan +dnet:countries @=@ dnet:countries @=@ VU @=@ Vanuatu +dnet:countries @=@ dnet:countries @=@ VE @=@ Venezuela +dnet:countries @=@ dnet:countries @=@ VN @=@ Viet Nam +dnet:countries @=@ dnet:countries @=@ VG @=@ Virgin Islands (British) +dnet:countries @=@ dnet:countries @=@ VI @=@ Virgin Islands, U.S. 
+dnet:countries @=@ dnet:countries @=@ WF @=@ Wallis and Futuna +dnet:countries @=@ dnet:countries @=@ EH @=@ Western Sahara +dnet:countries @=@ dnet:countries @=@ YE @=@ Yemen +dnet:countries @=@ dnet:countries @=@ YU @=@ Yugoslavia +dnet:countries @=@ dnet:countries @=@ ZM @=@ Zambia +dnet:countries @=@ dnet:countries @=@ ZW @=@ Zimbabwe +dnet:countries @=@ dnet:countries @=@ AX @=@ Åland Islands +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire2.0 @=@ OpenAIRE 2.0 (EC funding) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ driver-openaire2.0 @=@ OpenAIRE 2.0+ (DRIVER OA, EC funding) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire3.0 @=@ OpenAIRE 3.0 (OA, funding) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire4.0 @=@ OpenAIRE 4.0 (inst.&thematic. repo.) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ driver @=@ OpenAIRE Basic (DRIVER OA) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire2.0_data @=@ OpenAIRE Data (funded, referenced datasets) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ hostedBy @=@ collected from a compatible aggregator +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ UNKNOWN @=@ not available +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ native @=@ proprietary +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ notCompatible @=@ under validation +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire-cris_1.1 @=@ OpenAIRE CRIS v1.1 +fct:funding_relations @=@ fct:funding_relations @=@ fct:hasParentFunding @=@ fct:hasParentFunding +dnet:protocols @=@ dnet:protocols @=@ HTTPWithFileName @=@ HTTPWithFileName +dnet:protocols @=@ dnet:protocols @=@ NetCDF @=@ NetCDF +dnet:protocols @=@ dnet:protocols @=@ OpenDAP @=@ OpenDAP +dnet:protocols @=@ dnet:protocols @=@ schemaorg @=@ Schema.org +dnet:protocols @=@ dnet:protocols @=@ UNKNOWN @=@ UNKNOWN +dnet:protocols @=@ dnet:protocols @=@ api @=@ api +dnet:protocols @=@ dnet:protocols @=@ dataciteESPlugins @=@ dataciteESPlugins +dnet:protocols @=@ dnet:protocols @=@ datasetsbyjournal @=@ datasetsbyjournal +dnet:protocols @=@ dnet:protocols @=@ datasetsbyproject @=@ datasetsbyproject +dnet:protocols @=@ dnet:protocols @=@ excelFile @=@ excelFile +dnet:protocols @=@ dnet:protocols @=@ file @=@ file +dnet:protocols @=@ dnet:protocols @=@ fileGzip @=@ fileGzip +dnet:protocols @=@ dnet:protocols @=@ files_by_rpc @=@ files_by_rpc +dnet:protocols @=@ dnet:protocols @=@ files_from_mdstore @=@ files_from_mdstore +dnet:protocols @=@ dnet:protocols @=@ files_from_metadata @=@ files_from_metadata +dnet:protocols @=@ dnet:protocols @=@ filesystem @=@ filesystem +dnet:protocols @=@ dnet:protocols @=@ ftp @=@ ftp +dnet:protocols @=@ dnet:protocols @=@ gristProjects @=@ gristProjects +dnet:protocols @=@ dnet:protocols @=@ gtr2Projects @=@ gtr2Projects +dnet:protocols @=@ dnet:protocols @=@ http @=@ http +dnet:protocols @=@ dnet:protocols @=@ httpCSV @=@ httpCSV +dnet:protocols @=@ dnet:protocols @=@ httpList @=@ httpList +dnet:protocols @=@ dnet:protocols @=@ jdbc @=@ jdbc +dnet:protocols @=@ dnet:protocols @=@ oai @=@ oai +dnet:protocols @=@ dnet:protocols @=@ oai_sets @=@ oai_sets +dnet:protocols @=@ dnet:protocols @=@ other @=@ other +dnet:protocols @=@ dnet:protocols @=@ re3data @=@ re3data 
+dnet:protocols @=@ dnet:protocols @=@ rest @=@ rest +dnet:protocols @=@ dnet:protocols @=@ rest_json2xml @=@ rest_json2xml +dnet:protocols @=@ dnet:protocols @=@ sftp @=@ sftp +dnet:protocols @=@ dnet:protocols @=@ soap @=@ soap +dnet:protocols @=@ dnet:protocols @=@ sparql @=@ sparql +dnet:protocols @=@ dnet:protocols @=@ sword @=@ sword +dnet:protocols @=@ dnet:protocols @=@ targz @=@ targz +dnet:protocols @=@ dnet:protocols @=@ remoteMdstore @=@ remoteMdstore +wt:funding_typologies @=@ Wellcome Trust: Funding Typologies @=@ wt:fundingStream @=@ Wellcome Trust: Funding Stream +dnet:externalReference_typologies @=@ dnet:externalReference_typologies @=@ accessionNumber @=@ accessionNumber +dnet:externalReference_typologies @=@ dnet:externalReference_typologies @=@ dataset @=@ dataset +dnet:externalReference_typologies @=@ dnet:externalReference_typologies @=@ software @=@ software +datacite:id_typologies @=@ datacite:id_typologies @=@ ARK @=@ ARK +datacite:id_typologies @=@ datacite:id_typologies @=@ DOI @=@ DOI +datacite:id_typologies @=@ datacite:id_typologies @=@ EAN13 @=@ EAN13 +datacite:id_typologies @=@ datacite:id_typologies @=@ EISSN @=@ EISSN +datacite:id_typologies @=@ datacite:id_typologies @=@ Handle @=@ Handle +datacite:id_typologies @=@ datacite:id_typologies @=@ ISBN @=@ ISBN +datacite:id_typologies @=@ datacite:id_typologies @=@ ISSN @=@ ISSN +datacite:id_typologies @=@ datacite:id_typologies @=@ ISTC @=@ ISTC +datacite:id_typologies @=@ datacite:id_typologies @=@ LISSN @=@ LISSN +datacite:id_typologies @=@ datacite:id_typologies @=@ LSID @=@ LSID +datacite:id_typologies @=@ datacite:id_typologies @=@ PURL @=@ PURL +datacite:id_typologies @=@ datacite:id_typologies @=@ UNKNOWN @=@ UNKNOWN +datacite:id_typologies @=@ datacite:id_typologies @=@ UPC @=@ UPC +datacite:id_typologies @=@ datacite:id_typologies @=@ URL @=@ URL +datacite:id_typologies @=@ datacite:id_typologies @=@ URN @=@ URN +dnet:pid_types @=@ dnet:pid_types @=@ actrn @=@ ACTRN Identifier +dnet:pid_types @=@ dnet:pid_types @=@ nct @=@ ClinicalTrials.gov Identifier +dnet:pid_types @=@ dnet:pid_types @=@ euctr @=@ EU Clinical Trials Register +dnet:pid_types @=@ dnet:pid_types @=@ epo_id @=@ European Patent Office application ID +dnet:pid_types @=@ dnet:pid_types @=@ gsk @=@ GSK Identifier +dnet:pid_types @=@ dnet:pid_types @=@ GeoPass @=@ Geographic Location-Password Scheme +dnet:pid_types @=@ dnet:pid_types @=@ GBIF @=@ Global Biodiversity Information Facility +dnet:pid_types @=@ dnet:pid_types @=@ isrctn @=@ ISRCTN Identifier +dnet:pid_types @=@ dnet:pid_types @=@ ISNI @=@ International Standard Name Identifier +dnet:pid_types @=@ dnet:pid_types @=@ jprn @=@ JPRN Identifier +dnet:pid_types @=@ dnet:pid_types @=@ mag_id @=@ Microsoft Academic Graph Identifier +dnet:pid_types @=@ dnet:pid_types @=@ oai @=@ Open Archives Initiative +dnet:pid_types @=@ dnet:pid_types @=@ orcid @=@ Open Researcher and Contributor ID +dnet:pid_types @=@ dnet:pid_types @=@ PANGAEA @=@ PANGAEA +dnet:pid_types @=@ dnet:pid_types @=@ epo_nr_epodoc @=@ Patent application number in EPODOC format +dnet:pid_types @=@ dnet:pid_types @=@ UNKNOWN @=@ UNKNOWN +dnet:pid_types @=@ dnet:pid_types @=@ VIAF @=@ Virtual International Authority File +dnet:pid_types @=@ dnet:pid_types @=@ arXiv @=@ arXiv +dnet:pid_types @=@ dnet:pid_types @=@ doi @=@ doi +dnet:pid_types @=@ dnet:pid_types @=@ grid @=@ grid +dnet:pid_types @=@ dnet:pid_types @=@ info:eu-repo/dai @=@ info:eu-repo/dai +dnet:pid_types @=@ dnet:pid_types @=@ orcidworkid @=@ orcid workid 
+dnet:pid_types @=@ dnet:pid_types @=@ pmc @=@ pmc +dnet:pid_types @=@ dnet:pid_types @=@ pmid @=@ pmid +dnet:pid_types @=@ dnet:pid_types @=@ urn @=@ urn +dnet:pid_types @=@ dnet:pid_types @=@ who @=@ WHO Identifier +dnet:pid_types @=@ dnet:pid_types @=@ drks @=@ DRKS Identifier +dnet:pid_types @=@ dnet:pid_types @=@ handle @=@ Handle +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ACM @=@ An ACM classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ARXIV @=@ An ARXIV classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/DDC @=@ A Dewey Decimal classification term (DDC) that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/JEL @=@ A Journal of Economic Literature (JEL) classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/OPENACCESS_VERSION @=@ An Open Access versions of your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_REFERENCED_BY @=@ A dataset referenced by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/REFERENCES @=@ A dataset that refers to your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_RELATED_TO @=@ A dataset related to your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_TO @=@ A dataset that supplements your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/IS_RELATED_TO @=@ A publication related to your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/REFERENCES @=@ A publication referenced by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/IS_REFERENCED_BY @=@ A publication that refers to your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_BY @=@ A publication that is supplemented by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_TO @=@ A publication that supplements your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SOFTWARE @=@ A software referred by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/OPENACCESS_VERSION @=@ Another Open Access version of a publication +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/PID @=@ Another persistent identifier associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/MESHEUROPMC @=@ A classification term from the Medical Subject Headings (MeSH) that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/ABSTRACT @=@ An abstract describing among your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION_DATE @=@ A date of publication missing in your content +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PID @=@ A persistent identifier associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/ACM @=@ Another ACM classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/ARXIV @=@ Another ARXIV classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/DDC @=@ Another Dewey Decimal 
classification term (DDC) that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/JEL @=@ Another Journal of Economic Literature (JEL) classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/MESHEUROPMC @=@ Another classification term from the Medical Subject Headings (MeSH) that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PROJECT @=@ A project reference that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_BY @=@ A dataset that is supplemented by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/AUTHOR/ORCID @=@ An Open Researcher and Contributor ID (ORCID) that can be associated to an author of your publications +dnet:review_levels @=@ dnet:review_levels @=@ 0000 @=@ Unknown +dnet:review_levels @=@ dnet:review_levels @=@ 0002 @=@ nonPeerReviewed +dnet:review_levels @=@ dnet:review_levels @=@ 0001 @=@ peerReviewed \ No newline at end of file From 38f2508c87d9c748bc753c9373cb55903214263b Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 28 Jan 2021 08:24:45 +0100 Subject: [PATCH 05/86] new fields in mdstore beans --- .../mdstore/manager/common/model/MDStore.java | 45 +++++++++++++++++-- .../manager/common/model/MDStoreVersion.java | 33 ++++++++++---- .../manager/common/model/MDStoreWithInfo.java | 23 ++++++++++ 3 files changed, 88 insertions(+), 13 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java index 68fc024af..345500737 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java @@ -2,12 +2,15 @@ package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; +import java.util.Date; import java.util.UUID; import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; import javax.persistence.Table; +import javax.persistence.Temporal; +import javax.persistence.TemporalType; @Entity @Table(name = "mdstores") @@ -38,6 +41,13 @@ public class MDStore implements Serializable { @Column(name = "api_id") private String apiId; + @Column(name = "hdfs_path") + private String hdfsPath; + + @Column(name = "creation_date") + @Temporal(TemporalType.TIMESTAMP) + private Date creationDate; + public String getId() { return id; } @@ -94,9 +104,28 @@ public class MDStore implements Serializable { this.apiId = apiId; } + public String getHdfsPath() { + return hdfsPath; + } + + public void setHdfsPath(final String hdfsPath) { + this.hdfsPath = hdfsPath; + } + + public Date getCreationDate() { + return creationDate; + } + + public void setCreationDate(final Date creationDate) { + this.creationDate = creationDate; + } + public static MDStore newInstance( - final String format, final String layout, final String interpretation) { - return newInstance(format, layout, interpretation, null, null, null); + final String format, + final String layout, + final String interpretation, + final String hdfsBasePath) { + return newInstance(format, layout, interpretation, null, null, null, hdfsBasePath); } public static MDStore newInstance( @@ -105,15 +134,23 @@ public class MDStore implements Serializable { final String interpretation, 
final String dsName, final String dsId, - final String apiId) { + final String apiId, + final String hdfsBasePath) { + + final String mdId = "md-" + UUID.randomUUID(); + final MDStore md = new MDStore(); - md.setId("md-" + UUID.randomUUID()); + md.setId(mdId); md.setFormat(format); md.setLayout(layout); md.setInterpretation(interpretation); + md.setCreationDate(new Date()); md.setDatasourceName(dsName); md.setDatasourceId(dsId); md.setApiId(apiId); + md.setHdfsPath(String.format("%s/%s", hdfsBasePath, mdId)); + return md; } + } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java index 7ef24f191..62370c0f5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java @@ -38,15 +38,22 @@ public class MDStoreVersion implements Serializable { @Column(name = "size") private long size = 0; - public static MDStoreVersion newInstance(final String mdId, final boolean writing) { - final MDStoreVersion t = new MDStoreVersion(); - t.setId(mdId + "-" + new Date().getTime()); - t.setMdstore(mdId); - t.setLastUpdate(null); - t.setWriting(writing); - t.setReadCount(0); - t.setSize(0); - return t; + @Column(name = "hdfs_path") + private String hdfsPath; + + public static MDStoreVersion newInstance(final String mdId, final boolean writing, final String hdfsBasePath) { + final MDStoreVersion v = new MDStoreVersion(); + + final String versionId = mdId + "-" + new Date().getTime(); + v.setId(versionId); + v.setMdstore(mdId); + v.setLastUpdate(null); + v.setWriting(writing); + v.setReadCount(0); + v.setSize(0); + v.setHdfsPath(String.format("%s/%s/%s", hdfsBasePath, mdId, versionId)); + + return v; } public String getId() { @@ -96,4 +103,12 @@ public class MDStoreVersion implements Serializable { public void setSize(final long size) { this.size = size; } + + public String getHdfsPath() { + return hdfsPath; + } + + public void setHdfsPath(final String hdfsPath) { + this.hdfsPath = hdfsPath; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java index 438359241..72915a9c8 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java @@ -43,6 +43,10 @@ public class MDStoreWithInfo implements Serializable { @Column(name = "current_version") private String currentVersion; + @Column(name = "creation_date") + @Temporal(TemporalType.TIMESTAMP) + private Date creationDate; + @Column(name = "lastupdate") @Temporal(TemporalType.TIMESTAMP) private Date lastUpdate; @@ -53,6 +57,9 @@ public class MDStoreWithInfo implements Serializable { @Column(name = "n_versions") private long numberOfVersions = 0; + @Column(name = "hdfs_path") + private String hdfsPath; + public String getId() { return id; } @@ -117,6 +124,14 @@ public class MDStoreWithInfo implements Serializable { this.currentVersion = currentVersion; } + public Date getCreationDate() { + return creationDate; + } + + public void setCreationDate(final Date creationDate) { + this.creationDate = creationDate; + } + public Date getLastUpdate() { return lastUpdate; } @@ -140,4 +155,12 @@ public class MDStoreWithInfo 
implements Serializable { public void setNumberOfVersions(final long numberOfVersions) { this.numberOfVersions = numberOfVersions; } + + public String getHdfsPath() { + return hdfsPath; + } + + public void setHdfsPath(final String hdfsPath) { + this.hdfsPath = hdfsPath; + } } From 98b9498b5745d1129ed665c2f22a83db595c478a Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 28 Jan 2021 09:51:17 +0100 Subject: [PATCH 06/86] Removed old messaging system not quite used from collection and Transformation workflow code refactor --- .../actionmanager/project/utils/ReadCSV.java | 2 +- .../project/utils/ReadExcel.java | 3 +- .../common/AggregationCounter.java | 60 ++++---- .../GenerateNativeStoreSparkJob.java | 133 ++++++----------- .../collection/plugin/CollectorPlugin.java | 4 +- .../plugin/oai/OaiCollectorPlugin.java | 12 +- .../collection/plugin/oai/OaiIterator.java | 20 +-- ...Exception.java => CollectorException.java} | 12 +- .../collection/worker/CollectorWorker.java | 93 ++++++++++++ .../worker/CollectorWorkerApplication.java | 55 +++++++ .../worker/DnetCollectorWorker.java | 139 ------------------ .../DnetCollectorWorkerApplication.java | 49 ------ .../worker/utils/CollectorPluginFactory.java | 8 +- .../worker/utils/HttpConnector.java | 24 +-- .../DnetTransformationException.java | 39 ++--- .../transformation/TransformSparkJobNode.java | 33 ++--- .../transformation/TransformationFactory.java | 87 ++++++----- .../dhp/transformation/xslt/Cleaner.java | 3 +- .../xslt/XSLTTransformationFunction.java | 102 ++++++------- .../collection_input_parameters.json | 36 ----- .../dhp/collection/collector_parameter.json | 6 + .../collection/oozie_app/config-default.xml | 18 +++ .../dhp/collection/oozie_app/workflow.xml | 77 ++++------ .../collector/worker/collector_parameter.json | 12 -- .../oozie_app/config-default.xml | 18 +++ .../dhp/transformation/oozie_app/workflow.xml | 69 ++++----- .../transformation_input_parameters.json | 22 +-- .../project/EXCELParserTest.java | 7 +- .../httpconnector/HttpConnectorTest.java | 9 +- .../DnetCollectorWorkerApplicationTests.java | 49 ++---- .../transformation/TransformationJobTest.java | 86 +++++++---- .../dhp/oa/graph/clean/CleaningFunctions.java | 24 +-- .../raw/MigrateDbEntitiesApplication.java | 2 +- .../raw/GenerateEntitiesApplicationTest.java | 2 +- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 2 +- 35 files changed, 597 insertions(+), 720 deletions(-) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/{DnetCollectorException.java => CollectorException.java} (56%) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml delete mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json create mode 100644 
dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java index dc6f46771..ca1c10611 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java @@ -17,8 +17,8 @@ import org.apache.hadoop.fs.Path; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; /** * Applies the parsing of a csv file and writes the Serialization of it in hdfs diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java index e665bc704..585a408f3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java @@ -4,7 +4,6 @@ package eu.dnetlib.dhp.actionmanager.project.utils; import java.io.*; import java.nio.charset.StandardCharsets; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -15,8 +14,8 @@ import org.apache.hadoop.fs.Path; import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; /** * Applies the parsing of an excel file and writes the Serialization of it in hdfs diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationCounter.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationCounter.java index 1ac2cb54b..bf2fd22cb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationCounter.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationCounter.java @@ -1,45 +1,45 @@ -package eu.dnetlib.dhp.aggregation.common; -import org.apache.spark.util.LongAccumulator; +package eu.dnetlib.dhp.aggregation.common; import java.io.Serializable; +import org.apache.spark.util.LongAccumulator; public class AggregationCounter implements Serializable { - private LongAccumulator totalItems; - private LongAccumulator errorItems; - private LongAccumulator processedItems; + private LongAccumulator totalItems; + private LongAccumulator errorItems; + private LongAccumulator processedItems; - public AggregationCounter() { - } + public AggregationCounter() { + } - public AggregationCounter(LongAccumulator totalItems, LongAccumulator errorItems, LongAccumulator processedItems) { - this.totalItems = totalItems; - this.errorItems = errorItems; - this.processedItems = processedItems; - } + public AggregationCounter(LongAccumulator totalItems, LongAccumulator errorItems, LongAccumulator processedItems) { + this.totalItems = totalItems; + this.errorItems = errorItems; + this.processedItems = processedItems; + } 
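A minimal usage sketch for the AggregationCounter bean reformatted above, illustrative only and not part of this patch: it wires three named Spark LongAccumulators into the bean and bumps them the way a job would inside a transformation. The local SparkSession setup, the class name, and the accumulator names are assumptions for the example; only the AggregationCounter API itself comes from the patch.

    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.util.LongAccumulator;

    import eu.dnetlib.dhp.aggregation.common.AggregationCounter;

    public class AggregationCounterSketch {
        public static void main(String[] args) {
            // Local session for the sketch; a real job receives its session from the workflow.
            SparkSession spark = SparkSession.builder()
                .appName("aggregation-counter-sketch")
                .master("local[*]")
                .getOrCreate();

            // One named accumulator per metric, registered on the underlying SparkContext.
            LongAccumulator total = spark.sparkContext().longAccumulator("TotalItems");
            LongAccumulator errors = spark.sparkContext().longAccumulator("ErrorItems");
            LongAccumulator processed = spark.sparkContext().longAccumulator("ProcessedItems");

            AggregationCounter counter = new AggregationCounter(total, errors, processed);

            // Inside a map/flatMap the job would call e.g. counter.getTotalItems().add(1).
            counter.getTotalItems().add(1);
            counter.getProcessedItems().add(1);

            System.out.println("processed = " + counter.getProcessedItems().value());
            spark.stop();
        }
    }
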
- public LongAccumulator getTotalItems() { - return totalItems; - } + public LongAccumulator getTotalItems() { + return totalItems; + } - public void setTotalItems(LongAccumulator totalItems) { - this.totalItems = totalItems; - } + public void setTotalItems(LongAccumulator totalItems) { + this.totalItems = totalItems; + } - public LongAccumulator getErrorItems() { - return errorItems; - } + public LongAccumulator getErrorItems() { + return errorItems; + } - public void setErrorItems(LongAccumulator errorItems) { - this.errorItems = errorItems; - } + public void setErrorItems(LongAccumulator errorItems) { + this.errorItems = errorItems; + } - public LongAccumulator getProcessedItems() { - return processedItems; - } + public LongAccumulator getProcessedItems() { + return processedItems; + } - public void setProcessedItems(LongAccumulator processedItems) { - this.processedItems = processedItems; - } + public void setProcessedItems(LongAccumulator processedItems) { + this.processedItems = processedItems; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index c0bd4c940..c9c29b4ea 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -5,12 +5,9 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.ByteArrayInputStream; import java.nio.charset.StandardCharsets; -import java.util.HashMap; -import java.util.Map; import java.util.Objects; import java.util.Optional; -import org.apache.commons.cli.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.IntWritable; @@ -22,7 +19,6 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; import org.dom4j.Node; @@ -35,9 +31,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.model.mdstore.Provenance; -import eu.dnetlib.message.Message; import eu.dnetlib.message.MessageManager; -import eu.dnetlib.message.MessageType; public class GenerateNativeStoreSparkJob { @@ -46,100 +40,62 @@ public class GenerateNativeStoreSparkJob { public static void main(String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - GenerateNativeStoreSparkJob.class - .getResourceAsStream( - "/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); + IOUtils + .toString( + GenerateNativeStoreSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); parser.parseArgument(args); final ObjectMapper jsonMapper = new ObjectMapper(); - final Provenance provenance = jsonMapper.readValue(parser.get("provenance"), Provenance.class); - final long dateOfCollection = new Long(parser.get("dateOfCollection")); + final String provenanceArgument = parser.get("provenance"); + log.info("Provenance is {}", provenanceArgument); + final Provenance provenance = 
jsonMapper.readValue(provenanceArgument, Provenance.class); + final String dateOfCollectionArgs = parser.get("dateOfCollection"); + log.info("dateOfCollection is {}", dateOfCollectionArgs); + final long dateOfCollection = new Long(dateOfCollectionArgs); + final String sequenceFileInputPath = parser.get("input"); + log.info("sequenceFileInputPath is {}", dateOfCollectionArgs); Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final Map ongoingMap = new HashMap<>(); - final Map reportMap = new HashMap<>(); - - final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest")); - SparkConf conf = new SparkConf(); runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + conf, + isSparkSessionManaged, + spark -> { + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - final JavaPairRDD inputRDD = sc - .sequenceFile(parser.get("input"), IntWritable.class, Text.class); + final JavaPairRDD inputRDD = sc + .sequenceFile(sequenceFileInputPath, IntWritable.class, Text.class); - final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); - final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); + final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); + final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); - final MessageManager manager = new MessageManager( - parser.get("rabbitHost"), - parser.get("rabbitUser"), - parser.get("rabbitPassword"), - false, - false, - null); + final JavaRDD nativeStore = inputRDD + .map( + item -> parseRecord( + item._2().toString(), + parser.get("xpath"), + parser.get("encoding"), + provenance, + dateOfCollection, + totalItems, + invalidRecords)) + .filter(Objects::nonNull) + .distinct(); - final JavaRDD mappeRDD = inputRDD - .map( - item -> parseRecord( - item._2().toString(), - parser.get("xpath"), - parser.get("encoding"), - provenance, - dateOfCollection, - totalItems, - invalidRecords)) - .filter(Objects::nonNull) - .distinct(); + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset mdstore = spark.createDataset(nativeStore.rdd(), encoder); + final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords"); + mdStoreRecords.add(mdstore.count()); - ongoingMap.put("ongoing", "0"); - if (!test) { - manager - .sendMessage( - new Message( - parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), - parser.get("rabbitOngoingQueue"), - true, - false); - } + mdstore.write().format("parquet").save(parser.get("output")); - final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mdstore = spark.createDataset(mappeRDD.rdd(), encoder); - final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords"); - mdStoreRecords.add(mdstore.count()); - ongoingMap.put("ongoing", "" + totalItems.value()); - if (!test) { - manager - .sendMessage( - new Message( - parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), - parser.get("rabbitOngoingQueue"), - true, - false); - } - mdstore.write().format("parquet").save(parser.get("output")); - reportMap.put("inputItem", "" + 
totalItems.value()); - reportMap.put("invalidRecords", "" + invalidRecords.value()); - reportMap.put("mdStoreSize", "" + mdStoreRecords.value()); - if (!test) { - manager - .sendMessage( - new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), - parser.get("rabbitReportQueue"), - true, - false); - manager.close(); - } - }); + }); } @@ -166,12 +122,9 @@ public class GenerateNativeStoreSparkJob { } return new MetadataRecord(originalIdentifier, encoding, provenance, input, dateOfCollection); } catch (Throwable e) { - if (invalidRecords != null) - invalidRecords.add(1); - e.printStackTrace(); + invalidRecords.add(1); return null; } } - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index 4a0c70c45..7146e610e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -4,9 +4,9 @@ package eu.dnetlib.dhp.collection.plugin; import java.util.stream.Stream; import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; +import eu.dnetlib.dhp.collection.worker.CollectorException; public interface CollectorPlugin { - Stream collect(ApiDescriptor api) throws DnetCollectorException; + Stream collect(ApiDescriptor api) throws CollectorException; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index 7f71f401d..c4c52271a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -15,7 +15,7 @@ import com.google.common.collect.Lists; import eu.dnetlib.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; +import eu.dnetlib.dhp.collection.worker.CollectorException; public class OaiCollectorPlugin implements CollectorPlugin { @@ -27,7 +27,7 @@ public class OaiCollectorPlugin implements CollectorPlugin { private OaiIteratorFactory oaiIteratorFactory; @Override - public Stream collect(final ApiDescriptor api) throws DnetCollectorException { + public Stream collect(final ApiDescriptor api) throws CollectorException { final String baseUrl = api.getBaseUrl(); final String mdFormat = api.getParams().get(FORMAT_PARAM); final String setParam = api.getParams().get(OAI_SET_PARAM); @@ -46,19 +46,19 @@ public class OaiCollectorPlugin implements CollectorPlugin { } if (baseUrl == null || baseUrl.isEmpty()) { - throw new DnetCollectorException("Param 'baseurl' is null or empty"); + throw new CollectorException("Param 'baseurl' is null or empty"); } if (mdFormat == null || mdFormat.isEmpty()) { - throw new DnetCollectorException("Param 'mdFormat' is null or empty"); + throw new CollectorException("Param 'mdFormat' is null or empty"); } if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); + throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate); } if (untilDate != 
null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); + throw new CollectorException("Invalid date (YYYY-MM-DD): " + untilDate); } final Iterator> iters = sets diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index d61f13fb5..e54bae67d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -16,7 +16,7 @@ import org.dom4j.DocumentException; import org.dom4j.Node; import org.dom4j.io.SAXReader; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; +import eu.dnetlib.dhp.collection.worker.CollectorException; import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner; @@ -58,7 +58,7 @@ public class OaiIterator implements Iterator { this.started = true; try { this.token = firstPage(); - } catch (final DnetCollectorException e) { + } catch (final CollectorException e) { throw new RuntimeException(e); } } @@ -80,7 +80,7 @@ public class OaiIterator implements Iterator { while (queue.isEmpty() && token != null && !token.isEmpty()) { try { token = otherPages(token); - } catch (final DnetCollectorException e) { + } catch (final CollectorException e) { throw new RuntimeException(e); } } @@ -92,7 +92,7 @@ public class OaiIterator implements Iterator { public void remove() { } - private String firstPage() throws DnetCollectorException { + private String firstPage() throws CollectorException { try { String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8"); if (set != null && !set.isEmpty()) { @@ -108,7 +108,7 @@ public class OaiIterator implements Iterator { return downloadPage(url); } catch (final UnsupportedEncodingException e) { - throw new DnetCollectorException(e); + throw new CollectorException(e); } } @@ -126,18 +126,18 @@ public class OaiIterator implements Iterator { return result.trim(); } - private String otherPages(final String resumptionToken) throws DnetCollectorException { + private String otherPages(final String resumptionToken) throws CollectorException { try { return downloadPage( baseUrl + "?verb=ListRecords&resumptionToken=" + URLEncoder.encode(resumptionToken, "UTF-8")); } catch (final UnsupportedEncodingException e) { - throw new DnetCollectorException(e); + throw new CollectorException(e); } } - private String downloadPage(final String url) throws DnetCollectorException { + private String downloadPage(final String url) throws CollectorException { final String xml = httpConnector.getInputSource(url); Document doc; @@ -151,7 +151,7 @@ public class OaiIterator implements Iterator { } catch (final DocumentException e1) { final String resumptionToken = extractResumptionToken(xml); if (resumptionToken == null) { - throw new DnetCollectorException("Error parsing cleaned document:" + cleaned, e1); + throw new CollectorException("Error parsing cleaned document:" + cleaned, e1); } return resumptionToken; } @@ -164,7 +164,7 @@ public class OaiIterator implements Iterator { log.warn("noRecordsMatch for oai call: " + url); return null; } else { - throw new DnetCollectorException(code + " - " + errorNode.getText()); + throw new CollectorException(code + " - " + errorNode.getText()); } } 
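The OaiIterator hunks above change only the exception type, but they carry the core of the OAI plugin's paging logic: a first ListRecords request built from metadataPrefix/set/from/until, then follow-up requests that carry nothing but the resumptionToken, until the repository returns no (or an empty) token. A minimal, self-contained sketch of that flow follows; the endpoint URL and the names OaiPagingSketch/fetchPage are illustrative only and not part of the dhp codebase, and a real client would parse the record elements instead of skipping them.

import java.io.InputStream;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class OaiPagingSketch {

    // naive token extraction; OaiIterator uses dom4j plus XmlCleaner instead
    private static final Pattern TOKEN = Pattern
        .compile("<resumptionToken[^>]*>([^<]*)</resumptionToken>");

    public static void main(String[] args) throws Exception {
        final String baseUrl = "https://example.org/oai"; // hypothetical endpoint
        // first page: verb + metadataPrefix (optionally set/from/until)
        String url = baseUrl + "?verb=ListRecords&metadataPrefix="
            + URLEncoder.encode("oai_dc", "UTF-8");
        while (url != null) {
            final String xml = fetchPage(url);
            final Matcher m = TOKEN.matcher(xml);
            if (m.find() && !m.group(1).trim().isEmpty()) {
                // subsequent pages: the token replaces all other arguments
                url = baseUrl + "?verb=ListRecords&resumptionToken="
                    + URLEncoder.encode(m.group(1).trim(), "UTF-8");
            } else {
                url = null; // no (or empty) token: last page reached
            }
        }
    }

    private static String fetchPage(final String url) throws Exception {
        try (InputStream in = new URL(url).openStream();
            Scanner s = new Scanner(in, "UTF-8").useDelimiter("\\A")) {
            return s.hasNext() ? s.next() : "";
        }
    }
}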
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorException.java similarity index 56% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorException.java index f40962c21..71d225f13 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorException.java @@ -1,16 +1,16 @@ package eu.dnetlib.dhp.collection.worker; -public class DnetCollectorException extends Exception { +public class CollectorException extends Exception { /** */ private static final long serialVersionUID = -290723075076039757L; - public DnetCollectorException() { + public CollectorException() { super(); } - public DnetCollectorException( + public CollectorException( final String message, final Throwable cause, final boolean enableSuppression, @@ -18,15 +18,15 @@ public class DnetCollectorException extends Exception { super(message, cause, enableSuppression, writableStackTrace); } - public DnetCollectorException(final String message, final Throwable cause) { + public CollectorException(final String message, final Throwable cause) { super(message, cause); } - public DnetCollectorException(final String message) { + public CollectorException(final String message) { super(message); } - public DnetCollectorException(final Throwable cause) { + public CollectorException(final Throwable cause) { super(cause); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java new file mode 100644 index 000000000..380db641a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java @@ -0,0 +1,93 @@ + +package eu.dnetlib.dhp.collection.worker; + +import java.io.IOException; +import java.net.URI; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.collector.worker.model.ApiDescriptor; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; + +public class CollectorWorker { + + private static final Logger log = LoggerFactory.getLogger(CollectorWorker.class); + + private final CollectorPluginFactory collectorPluginFactory; + + private final ApiDescriptor api; + + private final String hdfsuri; + + private final String hdfsPath; + + public CollectorWorker( + final CollectorPluginFactory collectorPluginFactory, + final ApiDescriptor api, + final String hdfsuri, + final String hdfsPath) { + this.collectorPluginFactory = collectorPluginFactory; + this.api = api; + this.hdfsuri = hdfsuri; + this.hdfsPath = hdfsPath; + + } + + public void collect() throws CollectorException { + try 
{ + final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol()); + + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + + System.setProperty("hadoop.home.dir", "/"); + // Get the filesystem - HDFS + FileSystem.get(URI.create(hdfsuri), conf); + Path hdfswritepath = new Path(hdfsPath); + + log.info("Created path " + hdfswritepath.toString()); + + final AtomicInteger counter = new AtomicInteger(0); + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { + final IntWritable key = new IntWritable(counter.get()); + final Text value = new Text(); + plugin + .collect(api) + .forEach( + content -> { + key.set(counter.getAndIncrement()); + value.set(content); + try { + writer.append(key, value); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + } catch (Throwable e) { + throw new CollectorException("Error on collecting ", e); + } + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java new file mode 100644 index 000000000..5e8d0f9c2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java @@ -0,0 +1,55 @@ + +package eu.dnetlib.dhp.collection.worker; + +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.collector.worker.model.ApiDescriptor; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; + +/** + * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. 
This module + * will be executed on the hadoop cluster and taking in input some parameters that tells it which is the right collector + * plugin to use and where store the data into HDFS path + * + * @author Sandro La Bruzzo + */ +public class CollectorWorkerApplication { + + private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class); + + private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory(); + + /** + * @param args + */ + public static void main(final String[] args) throws Exception { + + final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser( + IOUtils + .toString( + CollectorWorker.class + .getResourceAsStream( + "/eu/dnetlib/dhp/collection/collector_parameter.json"))); + argumentParser.parseArgument(args); + + final String hdfsuri = argumentParser.get("namenode"); + + log.info("hdfsURI is {}", hdfsuri); + final String hdfsPath = argumentParser.get("hdfsPath"); + log.info("hdfsPath is {}" + hdfsPath); + final String apiDescriptor = argumentParser.get("apidescriptor"); + log.info("apiDescriptor is {}" + apiDescriptor); + + final ObjectMapper jsonMapper = new ObjectMapper(); + + final ApiDescriptor api = jsonMapper.readValue(apiDescriptor, ApiDescriptor.class); + + final CollectorWorker worker = new CollectorWorker(collectorPluginFactory, api, hdfsuri, hdfsPath); + worker.collect(); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java deleted file mode 100644 index e686ad518..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java +++ /dev/null @@ -1,139 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker; - -import java.io.IOException; -import java.net.URI; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; -import eu.dnetlib.message.MessageType; - -public class DnetCollectorWorker { - - private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class); - - private final CollectorPluginFactory collectorPluginFactory; - - private final ArgumentApplicationParser argumentParser; - - private final MessageManager manager; - - public DnetCollectorWorker( - final CollectorPluginFactory collectorPluginFactory, - final ArgumentApplicationParser argumentParser, - final MessageManager manager) - throws DnetCollectorException { - this.collectorPluginFactory = collectorPluginFactory; - this.argumentParser = argumentParser; - this.manager = manager; - } - - public void collect() throws DnetCollectorException { - try { - final ObjectMapper jsonMapper = new ObjectMapper(); - final ApiDescriptor api = 
jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class); - - final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol()); - - final String hdfsuri = argumentParser.get("namenode"); - - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - - System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS")); - System.setProperty("hadoop.home.dir", "/"); - // Get the filesystem - HDFS - FileSystem.get(URI.create(hdfsuri), conf); - Path hdfswritepath = new Path(argumentParser.get("hdfsPath")); - - log.info("Created path " + hdfswritepath.toString()); - - final Map ongoingMap = new HashMap<>(); - final Map reportMap = new HashMap<>(); - final AtomicInteger counter = new AtomicInteger(0); - try (SequenceFile.Writer writer = SequenceFile - .createWriter( - conf, - SequenceFile.Writer.file(hdfswritepath), - SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { - final IntWritable key = new IntWritable(counter.get()); - final Text value = new Text(); - plugin - .collect(api) - .forEach( - content -> { - key.set(counter.getAndIncrement()); - value.set(content); - if (counter.get() % 10 == 0) { - try { - ongoingMap.put("ongoing", "" + counter.get()); - log - .debug( - "Sending message: " - + manager - .sendMessage( - new Message( - argumentParser.get("workflowId"), - "Collection", - MessageType.ONGOING, - ongoingMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false)); - } catch (Exception e) { - log.error("Error on sending message ", e); - } - } - try { - writer.append(key, value); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - } - ongoingMap.put("ongoing", "" + counter.get()); - manager - .sendMessage( - new Message( - argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false); - reportMap.put("collected", "" + counter.get()); - manager - .sendMessage( - new Message( - argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false); - manager.close(); - } catch (Throwable e) { - throw new DnetCollectorException("Error on collecting ", e); - } - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java deleted file mode 100644 index da30e8793..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java +++ /dev/null @@ -1,49 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker; - -import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.message.MessageManager; - -/** - * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. 
This module - * will be executed on the hadoop cluster and taking in input some parameters that tells it which is the right collector - * plugin to use and where store the data into HDFS path - * - * @author Sandro La Bruzzo - */ -public class DnetCollectorWorkerApplication { - - private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class); - - private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory(); - - private static ArgumentApplicationParser argumentParser; - - /** @param args */ - public static void main(final String[] args) throws Exception { - - argumentParser = new ArgumentApplicationParser( - IOUtils - .toString( - DnetCollectorWorker.class - .getResourceAsStream( - "/eu/dnetlib/collector/worker/collector_parameter.json"))); - argumentParser.parseArgument(args); - log.info("hdfsPath =" + argumentParser.get("hdfsPath")); - log.info("json = " + argumentParser.get("apidescriptor")); - final MessageManager manager = new MessageManager( - argumentParser.get("rabbitHost"), - argumentParser.get("rabbitUser"), - argumentParser.get("rabbitPassword"), - false, - false, - null); - final DnetCollectorWorker worker = new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager); - worker.collect(); - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java index 7a0028e79..6b070b191 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java @@ -3,18 +3,18 @@ package eu.dnetlib.dhp.collection.worker.utils; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; +import eu.dnetlib.dhp.collection.worker.CollectorException; public class CollectorPluginFactory { - public CollectorPlugin getPluginByProtocol(final String protocol) throws DnetCollectorException { + public CollectorPlugin getPluginByProtocol(final String protocol) throws CollectorException { if (protocol == null) - throw new DnetCollectorException("protocol cannot be null"); + throw new CollectorException("protocol cannot be null"); switch (protocol.toLowerCase().trim()) { case "oai": return new OaiCollectorPlugin(); default: - throw new DnetCollectorException("UNknown protocol"); + throw new CollectorException("UNknown protocol"); } } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java index 5d6108fad..ff3c18aba 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java @@ -19,7 +19,7 @@ import org.apache.commons.lang.math.NumberUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; +import eu.dnetlib.dhp.collection.worker.CollectorException; public class HttpConnector { @@ -42,9 +42,9 @@ public class HttpConnector { * * @param 
requestUrl the URL * @return the content of the downloaded resource - * @throws DnetCollectorException when retrying more than maxNumberOfRetry times + * @throws CollectorException when retrying more than maxNumberOfRetry times */ - public String getInputSource(final String requestUrl) throws DnetCollectorException { + public String getInputSource(final String requestUrl) throws CollectorException { return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); } @@ -53,15 +53,15 @@ public class HttpConnector { * * @param requestUrl the URL * @return the content of the downloaded resource as InputStream - * @throws DnetCollectorException when retrying more than maxNumberOfRetry times + * @throws CollectorException when retrying more than maxNumberOfRetry times */ - public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException { + public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException { return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); } private String attemptDownlaodAsString( final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws DnetCollectorException { + throws CollectorException { try { final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); try { @@ -75,16 +75,16 @@ public class HttpConnector { IOUtils.closeQuietly(s); } } catch (final InterruptedException e) { - throw new DnetCollectorException(e); + throw new CollectorException(e); } } private InputStream attemptDownload( final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws DnetCollectorException { + throws CollectorException { if (retryNumber > maxNumberOfRetry) { - throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList); + throw new CollectorException("Max number of retries exceeded. 
Cause: \n " + errorList); } log.debug("Downloading " + requestUrl + " - try: " + retryNumber); @@ -144,7 +144,7 @@ public class HttpConnector { return attemptDownload(requestUrl, retryNumber + 1, errorList); } } catch (final InterruptedException e) { - throw new DnetCollectorException(e); + throw new CollectorException(e); } } @@ -173,13 +173,13 @@ public class HttpConnector { } private String obtainNewLocation(final Map> headerMap) - throws DnetCollectorException { + throws CollectorException { for (final String key : headerMap.keySet()) { if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) { return headerMap.get(key).get(0); } } - throw new DnetCollectorException( + throw new CollectorException( "The requested url has been MOVED, but 'location' param is MISSING"); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java index 2c932e40b..45bd844e2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java @@ -1,28 +1,29 @@ + package eu.dnetlib.dhp.transformation; public class DnetTransformationException extends Exception { - public DnetTransformationException() { - super(); - } + public DnetTransformationException() { + super(); + } - public DnetTransformationException( - final String message, - final Throwable cause, - final boolean enableSuppression, - final boolean writableStackTrace) { - super(message, cause, enableSuppression, writableStackTrace); - } + public DnetTransformationException( + final String message, + final Throwable cause, + final boolean enableSuppression, + final boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } - public DnetTransformationException(final String message, final Throwable cause) { - super(message, cause); - } + public DnetTransformationException(final String message, final Throwable cause) { + super(message, cause); + } - public DnetTransformationException(final String message) { - super(message); - } + public DnetTransformationException(final String message) { + super(message); + } - public DnetTransformationException(final Throwable cause) { - super(cause); - } + public DnetTransformationException(final Throwable cause) { + super(cause); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index 6e07e5173..c6ed5a1e3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -9,11 +9,6 @@ import java.util.Map; import java.util.Objects; import java.util.Optional; -import eu.dnetlib.dhp.aggregation.common.AggregationCounter; -import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; -import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -30,10 +25,15 @@ 
import org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.aggregation.common.AggregationCounter; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; +import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; import eu.dnetlib.dhp.utils.DHPUtils; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.message.Message; import eu.dnetlib.message.MessageManager; import eu.dnetlib.message.MessageType; @@ -59,10 +59,9 @@ public class TransformSparkJobNode { .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("input"); - final String outputPath = parser.get("output"); + final String inputPath = parser.get("mdstoreInputPath"); + final String outputPath = parser.get("mdstoreOutputPath"); // TODO this variable will be used after implementing Messaging with DNet Aggregator - final String workflowId = parser.get("workflowId"); final String isLookupUrl = parser.get("isLookupUrl"); log.info(String.format("isLookupUrl: %s", isLookupUrl)); @@ -76,24 +75,22 @@ public class TransformSparkJobNode { spark -> transformRecords(parser.getObjectMap(), isLookupService, spark, inputPath, outputPath)); } - - public static void transformRecords(final Mapargs, final ISLookUpService isLookUpService, final SparkSession spark, final String inputPath, final String outputPath) throws DnetTransformationException { + public static void transformRecords(final Map args, final ISLookUpService isLookUpService, + final SparkSession spark, final String inputPath, final String outputPath) throws DnetTransformationException { final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems"); final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems"); final LongAccumulator transformedItems = spark.sparkContext().longAccumulator("transformedItems"); - final AggregationCounter ct = new AggregationCounter(totalItems, errorItems,transformedItems ); + final AggregationCounter ct = new AggregationCounter(totalItems, errorItems, transformedItems); final Encoder encoder = Encoders.bean(MetadataRecord.class); final Dataset mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder); - final MapFunction XSLTTransformationFunction = TransformationFactory.getTransformationPlugin(args,ct, isLookUpService); + final MapFunction XSLTTransformationFunction = TransformationFactory + .getTransformationPlugin(args, ct, isLookUpService); mdstoreInput.map(XSLTTransformationFunction, encoder).write().save(outputPath); - log.info("Transformed item "+ ct.getProcessedItems().count()); - log.info("Total item "+ ct.getTotalItems().count()); - log.info("Transformation Error item "+ ct.getErrorItems().count()); + log.info("Transformed item " + ct.getProcessedItems().count()); + log.info("Total item " + ct.getTotalItems().count()); + log.info("Transformation Error item " + ct.getErrorItems().count()); } - - - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java index 0296458a5..58292139a 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java @@ -1,62 +1,69 @@ + package eu.dnetlib.dhp.transformation; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.api.java.function.MapFunction; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import eu.dnetlib.dhp.aggregation.common.AggregationCounter; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import org.apache.commons.lang3.StringUtils; -import org.apache.spark.api.java.function.MapFunction; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.List; -import java.util.Map; public class TransformationFactory { - private static final Logger log = LoggerFactory.getLogger(TransformationFactory.class); - public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') where $x//TITLE = \"%s\" return $x//CODE/text()"; + private static final Logger log = LoggerFactory.getLogger(TransformationFactory.class); + public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') where $x//TITLE = \"%s\" return $x//CODE/text()"; + public static MapFunction getTransformationPlugin( + final Map jobArgument, final AggregationCounter counters, final ISLookUpService isLookupService) + throws DnetTransformationException { - public static MapFunction getTransformationPlugin(final Map jobArgument, final AggregationCounter counters, final ISLookUpService isLookupService) throws DnetTransformationException { + try { + final String transformationPlugin = jobArgument.get("transformationPlugin"); - try { - final String transformationPlugin = jobArgument.get("transformationPlugin"); + log.info("Transformation plugin required " + transformationPlugin); + switch (transformationPlugin) { + case "XSLT_TRANSFORM": { + final String transformationRuleName = jobArgument.get("transformationRuleTitle"); + if (StringUtils.isBlank(transformationRuleName)) + throw new DnetTransformationException("Missing Parameter transformationRule"); + final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService); - log.info("Transformation plugin required "+transformationPlugin); - switch (transformationPlugin) { - case "XSLT_TRANSFORM": { - final String transformationRuleName = jobArgument.get("transformationRule"); - if (StringUtils.isBlank(transformationRuleName)) - throw new DnetTransformationException("Missing Parameter transformationRule"); - final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService); + final String transformationRule = queryTransformationRuleFromIS( + transformationRuleName, isLookupService); - final String transformationRule = queryTransformationRuleFromIS(transformationRuleName, isLookupService); + final long dateOfTransformation = new Long(jobArgument.get("dateOfTransformation")); + return new XSLTTransformationFunction(counters, transformationRule, dateOfTransformation, + vocabularies); - final long dateOfTransformation = new Long(jobArgument.get("dateOfTransformation")); - return new 
XSLTTransformationFunction(counters,transformationRule,dateOfTransformation,vocabularies); + } + default: + throw new DnetTransformationException( + "transformation plugin does not exists for " + transformationPlugin); - } - default: - throw new DnetTransformationException("transformation plugin does not exists for " + transformationPlugin); + } - } + } catch (Throwable e) { + throw new DnetTransformationException(e); + } + } - } catch (Throwable e) { - throw new DnetTransformationException(e); - } - } - - private static String queryTransformationRuleFromIS(final String transformationRuleName, final ISLookUpService isLookUpService) throws Exception { - final String query = String.format(TRULE_XQUERY, transformationRuleName); - log.info("asking query to IS: "+ query); - List result = isLookUpService.quickSearchProfile(query); - - if (result==null || result.isEmpty()) - throw new DnetTransformationException("Unable to find transformation rule with name: "+ transformationRuleName); - return result.get(0); - } + private static String queryTransformationRuleFromIS(final String transformationRuleName, + final ISLookUpService isLookUpService) throws Exception { + final String query = String.format(TRULE_XQUERY, transformationRuleName); + log.info("asking query to IS: " + query); + List result = isLookUpService.quickSearchProfile(query); + if (result == null || result.isEmpty()) + throw new DnetTransformationException( + "Unable to find transformation rule with name: " + transformationRuleName); + return result.get(0); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java index 2c6d776af..7b0fdd484 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.transformation.xslt; - import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Qualifier; import net.sf.saxon.s9api.*; @@ -40,6 +39,6 @@ public class Cleaner implements ExtensionFunction, Serializable { Qualifier cleanedValue = vocabularies.getSynonymAsQualifier(vocabularyName, currentValue); return new XdmAtomicValue( - cleanedValue != null ? cleanedValue.getClassid() : currentValue); + cleanedValue != null ? 
cleanedValue.getClassid() : currentValue); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java index c02b83345..d8707cd76 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java @@ -1,66 +1,68 @@ package eu.dnetlib.dhp.transformation.xslt; +import java.io.ByteArrayInputStream; +import java.io.StringWriter; + +import javax.xml.transform.stream.StreamSource; + +import org.apache.spark.api.java.function.MapFunction; + import eu.dnetlib.dhp.aggregation.common.AggregationCounter; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import net.sf.saxon.s9api.*; -import org.apache.spark.api.java.function.MapFunction; - -import javax.xml.transform.stream.StreamSource; -import java.io.ByteArrayInputStream; -import java.io.StringWriter; public class XSLTTransformationFunction implements MapFunction { - private final AggregationCounter aggregationCounter; + private final AggregationCounter aggregationCounter; - private final String transformationRule; + private final String transformationRule; - private final Cleaner cleanFunction; + private final Cleaner cleanFunction; - private final long dateOfTransformation; + private final long dateOfTransformation; - public XSLTTransformationFunction( - final AggregationCounter aggregationCounter, - final String transformationRule, - long dateOfTransformation, - final VocabularyGroup vocabularies) - throws Exception { - this.aggregationCounter = aggregationCounter; - this.transformationRule = transformationRule; - this.dateOfTransformation = dateOfTransformation; - cleanFunction = new Cleaner(vocabularies); - } + public XSLTTransformationFunction( + final AggregationCounter aggregationCounter, + final String transformationRule, + long dateOfTransformation, + final VocabularyGroup vocabularies) + throws Exception { + this.aggregationCounter = aggregationCounter; + this.transformationRule = transformationRule; + this.dateOfTransformation = dateOfTransformation; + cleanFunction = new Cleaner(vocabularies); + } - @Override - public MetadataRecord call(MetadataRecord value) { - aggregationCounter.getTotalItems().add(1); - try { - Processor processor = new Processor(false); - processor.registerExtensionFunction(cleanFunction); - final XsltCompiler comp = processor.newXsltCompiler(); - XsltExecutable xslt = comp - .compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes()))); - XdmNode source = processor - .newDocumentBuilder() - .build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes()))); - XsltTransformer trans = xslt.load(); - trans.setInitialContextNode(source); - final StringWriter output = new StringWriter(); - Serializer out = processor.newSerializer(output); - out.setOutputProperty(Serializer.Property.METHOD, "xml"); - out.setOutputProperty(Serializer.Property.INDENT, "yes"); - trans.setDestination(out); - trans.transform(); - final String xml = output.toString(); - value.setBody(xml); - value.setDateOfTransformation(dateOfTransformation); - aggregationCounter.getProcessedItems().add(1); - return value; - } catch (Throwable e) { - aggregationCounter.getErrorItems().add(1); - return null; - } - } + 
@Override + public MetadataRecord call(MetadataRecord value) { + aggregationCounter.getTotalItems().add(1); + try { + Processor processor = new Processor(false); + processor.registerExtensionFunction(cleanFunction); + final XsltCompiler comp = processor.newXsltCompiler(); + XsltExecutable xslt = comp + .compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes()))); + XdmNode source = processor + .newDocumentBuilder() + .build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes()))); + XsltTransformer trans = xslt.load(); + trans.setInitialContextNode(source); + final StringWriter output = new StringWriter(); + Serializer out = processor.newSerializer(output); + out.setOutputProperty(Serializer.Property.METHOD, "xml"); + out.setOutputProperty(Serializer.Property.INDENT, "yes"); + trans.setDestination(out); + trans.transform(); + final String xml = output.toString(); + value.setBody(xml); + value.setDateOfTransformation(dateOfTransformation); + aggregationCounter.getProcessedItems().add(1); + return value; + } catch (Throwable e) { + aggregationCounter.getErrorItems().add(1); + return null; + } + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json index 4a6aec5ee..7f5113930 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json @@ -41,46 +41,10 @@ "paramDescription": "the path of the result DataFrame on HDFS", "paramRequired": true }, - { - "paramName": "ru", - "paramLongName": "rabbitUser", - "paramDescription": "the user to connect with RabbitMq for messaging", - "paramRequired": true - }, - { - "paramName": "rp", - "paramLongName": "rabbitPassword", - "paramDescription": "the password to connect with RabbitMq for messaging", - "paramRequired": true - }, - { - "paramName": "rh", - "paramLongName": "rabbitHost", - "paramDescription": "the host of the RabbitMq server", - "paramRequired": true - }, - { - "paramName": "ro", - "paramLongName": "rabbitOngoingQueue", - "paramDescription": "the name of the ongoing queue", - "paramRequired": true - }, - { - "paramName": "rr", - "paramLongName": "rabbitReportQueue", - "paramDescription": "the name of the report queue", - "paramRequired": true - }, { "paramName": "w", "paramLongName": "workflowId", "paramDescription": "the identifier of the dnet Workflow", - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "isTest", - "paramDescription": "the name of the report queue", "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json new file mode 100644 index 000000000..901664e0d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json @@ -0,0 +1,6 @@ +[ + {"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true}, + {"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true}, + {"paramName":"n", "paramLongName":"namenode", 
"paramDescription": "the Name Node URI", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": false} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml new file mode 100644 index 000000000..2e0ed9aee --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index 3e7f68401..38cd83da7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -1,10 +1,5 @@ - - sequenceFilePath - the path to store the sequence file of the native metadata collected - - mdStorePath the path of the native mdstore @@ -39,72 +34,52 @@ The identifier of the workflow + + ${jobTracker} + ${nameNode} + - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - + - ${jobTracker} - ${nameNode} - eu.dnetlib.dhp.collection.worker.DnetCollectorWorker - -p${sequenceFilePath} - -a${apiDescription} - -n${nameNode} - -rh${rmq_host} - -ru${rmq_user} - -rp${rmq_pwd} - -rr${rmq_report} - -ro${rmq_ongoing} - -usandro.labruzzo - -w${workflowId} + eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication + --hdfsPath${workingDir}/sequenceFile_${mdstoreVersion} + --apidescriptor${apiDescription} + --namenode${nameNode} + - ${jobTracker} - ${nameNode} yarn cluster - GenerateNativeStoreSparkJob + Generate Native MetadataStore eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob - dhp-aggregations-1.0.0-SNAPSHOT.jar - --num-executors 50 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" - --encoding ${metadataEncoding} - --dateOfCollection ${timestamp} - --provenance ${dataSourceInfo} + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --encoding${metadataEncoding} + --dateOfCollection${timestamp} + --provenance${dataSourceInfo} --xpath${identifierPath} - --input${sequenceFilePath} + --input${workingDir}/sequenceFile --output${mdStorePath} - -rh${rmq_host} - -ru${rmq_user} - -rp${rmq_pwd} - -rr${rmq_report} - -ro${rmq_ongoing} -w${workflowId} - - - - - - - - diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json deleted file mode 100644 index c247d15e4..000000000 --- 
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - {"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true}, - {"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true}, - {"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true}, - {"paramName":"u", "paramLongName":"userHDFS", "paramDescription": "the user wich create the hdfs seq file", "paramRequired": true}, - {"paramName":"ru", "paramLongName":"rabbitUser", "paramDescription": "the user to connect with RabbitMq for messaging", "paramRequired": true}, - {"paramName":"rp", "paramLongName":"rabbitPassword", "paramDescription": "the password to connect with RabbitMq for messaging", "paramRequired": true}, - {"paramName":"rh", "paramLongName":"rabbitHost", "paramDescription": "the host of the RabbitMq server", "paramRequired": true}, - {"paramName":"ro", "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue", "paramRequired": true}, - {"paramName":"rr", "paramLongName":"rabbitReportQueue", "paramDescription": "the name of the report queue", "paramRequired": true}, - {"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": true} -] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml new file mode 100644 index 000000000..2e0ed9aee --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml index 4b1e3d84b..b36bc3766 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml @@ -2,7 +2,7 @@ mdstoreInputPath - the path of the input MDStore + the path of the native MDStore @@ -11,66 +11,57 @@ - transformationRule + transformationRuleTitle The transformation Rule to apply - timestamp - The timestamp of the collection date + transformationPlugin + The transformation Plugin - workflowId - The identifier of the workflow + dateOfTransformation + The timestamp of the transformation date + + - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - + - ${jobTracker} - ${nameNode} yarn cluster - MDBuilder + Transform MetadataStore eu.dnetlib.dhp.transformation.TransformSparkJobNode - dhp-aggregations-1.0.0-SNAPSHOT.jar - --num-executors 50 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" - --dateOfCollection ${timestamp} - -mt yarn - --input${mdstoreInputPath} - --output${mdstoreOutputPath} - -w${workflowId} - 
-tr${transformationRule} - -ru${rmq_user} - -rp${rmq_pwd} - -rh${rmq_host} - -ro${rmq_ongoing} - -rr${rmq_report} + dhp-aggregations-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --mdstoreInputPath${mdstoreInputPath} + --mdstoreOutputPath${mdstoreOutputPath} + --dateOfTransformation${dateOfTransformation} + --transformationPlugin${transformationPlugin} + --transformationRuleTitle${transformationRuleTitle} + + - - - - - - - - + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json index fd2a96ea0..cbd2f25ab 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json @@ -13,28 +13,32 @@ }, { "paramName": "i", - "paramLongName": "input", + "paramLongName": "mdstoreInputPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true }, { "paramName": "o", - "paramLongName": "output", + "paramLongName": "mdstoreOutputPath", "paramDescription": "the path of the result DataFrame on HDFS", "paramRequired": true }, - { - "paramName": "w", - "paramLongName": "workflowId", - "paramDescription": "the identifier of the dnet Workflow", - "paramRequired": true - }, { "paramName": "tr", - "paramLongName": "transformationRule", + "paramLongName": "transformationRuleTitle", "paramDescription": "the transformation Rule to apply to the input MDStore", "paramRequired": true }, + + { + "paramName": "i", + "paramLongName": "isLookupUrl", + "paramDescription": "the Information System Service LookUp URL", + "paramRequired": true + }, + + + { "paramName": "tp", "paramLongName": "transformationPlugin", diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java index c1142ad9c..5c37e9ec3 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java @@ -6,16 +6,15 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.List; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; - import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser; +import eu.dnetlib.dhp.collection.worker.CollectorException; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; @Disabled public class EXCELParserTest { @@ -31,7 +30,7 @@ public class EXCELParserTest { } @Test - public void test1() throws 
DnetCollectorException, IOException, InvalidFormatException, ClassNotFoundException, + public void test1() throws CollectorException, IOException, InvalidFormatException, ClassNotFoundException, IllegalAccessException, InstantiationException { EXCELParser excelParser = new EXCELParser(); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java index 3b9d1c3ab..f5ef280a0 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java @@ -1,8 +1,6 @@ package eu.dnetlib.dhp.actionmanager.project.httpconnector; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; @@ -11,6 +9,9 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import eu.dnetlib.dhp.collection.worker.CollectorException; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; + @Disabled public class HttpConnectorTest { @@ -31,12 +32,12 @@ public class HttpConnectorTest { @Test - public void testGetInputSource() throws DnetCollectorException { + public void testGetInputSource() throws CollectorException { System.out.println(connector.getInputSource(URL)); } @Test - public void testGoodServers() throws DnetCollectorException { + public void testGoodServers() throws CollectorException { System.out.println(connector.getInputSource(URL_GOODSNI_SERVER)); } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java index c745219fe..fc19f2064 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java @@ -5,17 +5,19 @@ import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.mockito.Mockito.*; import java.io.File; +import java.nio.file.Path; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.DnetCollectorWorker; +import eu.dnetlib.dhp.collection.worker.CollectorWorker; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.message.Message; import eu.dnetlib.message.MessageManager; @@ -23,43 +25,6 @@ import eu.dnetlib.message.MessageManager; @Disabled public class DnetCollectorWorkerApplicationTests { - private final ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class); - private final MessageManager messageManager = mock(MessageManager.class); - - private 
DnetCollectorWorker worker; - - @BeforeEach - public void setup() throws Exception { - ObjectMapper mapper = new ObjectMapper(); - final String apiJson = mapper.writeValueAsString(getApi()); - when(argumentParser.get("apidescriptor")).thenReturn(apiJson); - when(argumentParser.get("namenode")).thenReturn("file://tmp/test.seq"); - when(argumentParser.get("hdfsPath")).thenReturn("/tmp/file.seq"); - when(argumentParser.get("userHDFS")).thenReturn("sandro"); - when(argumentParser.get("workflowId")).thenReturn("sandro"); - when(argumentParser.get("rabbitOngoingQueue")).thenReturn("sandro"); - - when(messageManager.sendMessage(any(Message.class), anyString(), anyBoolean(), anyBoolean())) - .thenAnswer( - a -> { - System.out.println("sent message: " + a.getArguments()[0]); - return true; - }); - when(messageManager.sendMessage(any(Message.class), anyString())) - .thenAnswer( - a -> { - System.out.println("Called"); - return true; - }); - worker = new DnetCollectorWorker(new CollectorPluginFactory(), argumentParser, messageManager); - } - - @AfterEach - public void dropDown() { - File f = new File("/tmp/file.seq"); - f.delete(); - } - @Test public void testFindPlugin() throws Exception { final CollectorPluginFactory collectorPluginEnumerator = new CollectorPluginFactory(); @@ -79,8 +44,14 @@ public class DnetCollectorWorkerApplicationTests { } @Test - public void testFeeding() throws Exception { + public void testFeeding(@TempDir Path testDir) throws Exception { + + System.out.println(testDir.toString()); + CollectorWorker worker = new CollectorWorker(new CollectorPluginFactory(), getApi(), + "file://" + testDir.toString() + "/file.seq", testDir.toString() + "/file.seq"); worker.collect(); + + // TODO create ASSERT HERE } private ApiDescriptor getApi() { diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 5479e0b57..6a80e01e2 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.transformation; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.mockito.Mockito.lenient; @@ -14,13 +15,13 @@ import java.util.stream.Stream; import javax.xml.transform.stream.StreamSource; -import eu.dnetlib.dhp.aggregation.common.AggregationCounter; -import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; -import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; @@ -31,8 +32,14 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.io.TempDir; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; + +import eu.dnetlib.dhp.aggregation.common.AggregationCounter; import 
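The reworked testFeeding above still carries a bare "TODO create ASSERT HERE". One possible check (an assumption, not the project's actual assertion) would be to re-open the sequence file the CollectorWorker wrote and verify it is non-empty; the Writable key/value types below are assumed by analogy with the writer in ImportDatacite later in this series:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}

// count the records the worker wrote; key/value types are an assumption
def countRecords(path: String): Int = {
  val reader = new SequenceFile.Reader(new Configuration(),
    SequenceFile.Reader.file(new Path(path)))
  val key   = new IntWritable
  val value = new Text
  var n = 0
  try { while (reader.next(key, value)) n += 1 } finally reader.close()
  n
}
// e.g. assert(countRecords(testDir.toString + "/file.seq") > 0)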
eu.dnetlib.dhp.collection.CollectionJobTest; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @ExtendWith(MockitoExtension.class) public class TransformationJobTest { @@ -49,8 +56,8 @@ public class TransformationJobTest { lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs()); lenient() - .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) - .thenReturn(synonyms()); + .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) + .thenReturn(synonyms()); vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService); } @@ -67,7 +74,6 @@ public class TransformationJobTest { spark.stop(); } - @Test @DisplayName("Test Transform Single XML using XSLTTransformator") public void testTransformSaxonHE() throws Exception { @@ -76,19 +82,15 @@ public class TransformationJobTest { final MetadataRecord mr = new MetadataRecord(); mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); - - // We Load the XSLT trasformation Rule from the classpath + // We Load the XSLT transformation Rule from the classpath XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/ext_simple.xsl"); - //Print the record + // Print the record System.out.println(tr.call(mr).getBody()); - //TODO Create significant Assert + // TODO Create significant Assert } - - - @DisplayName("Test TransformSparkJobNode.main") @Test public void transformTest(@TempDir Path testDir) throws Exception { @@ -96,24 +98,44 @@ public class TransformationJobTest { final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); final String mdstore_output = testDir.toString() + "/version"; - - mockupTrasformationRule("simpleTRule","/eu/dnetlib/dhp/transform/ext_simple.xsl"); + mockupTrasformationRule("simpleTRule", "/eu/dnetlib/dhp/transform/ext_simple.xsl"); // final String arguments = "-issm true -i %s -o %s -d 1 -w 1 -tp XSLT_TRANSFORM -tr simpleTRule"; - final Map parameters = Stream.of(new String[][] { - { "dateOfTransformation", "1234" }, - { "transformationPlugin", "XSLT_TRANSFORM" }, - { "transformationRule", "simpleTRule" }, + final Map parameters = Stream.of(new String[][] { + { + "dateOfTransformation", "1234" + }, + { + "transformationPlugin", "XSLT_TRANSFORM" + }, + { + "transformationRuleTitle", "simpleTRule" + }, }).collect(Collectors.toMap(data -> data[0], data -> data[1])); - TransformSparkJobNode.transformRecords(parameters,isLookUpService,spark,mdstore_input, mdstore_output); - - - + TransformSparkJobNode.transformRecords(parameters, isLookUpService, spark, mdstore_input, mdstore_output); // TODO introduce useful assertions + + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset mOutput = spark.read().format("parquet").load(mdstore_output).as(encoder); + + final Long total = mOutput.count(); + + final long recordTs = mOutput + .filter((FilterFunction) p -> p.getDateOfTransformation() == 1234) + .count(); + + final long recordNotEmpty = mOutput + .filter((FilterFunction) p -> !StringUtils.isBlank(p.getBody())) + .count(); + + assertEquals(total, recordTs); + + assertEquals(total, recordNotEmpty); + } @Test @@ -128,27 +150,27 @@ public 
class TransformationJobTest { Files.deleteIfExists(tempDirWithPrefix); } - - private void mockupTrasformationRule(final String trule, final String path)throws Exception { + private void mockupTrasformationRule(final String trule, final String path) throws Exception { final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path)); - lenient().when(isLookUpService.quickSearchProfile(String.format(TransformationFactory.TRULE_XQUERY,trule))) - .thenReturn(Collections.singletonList(trValue)); + lenient() + .when(isLookUpService.quickSearchProfile(String.format(TransformationFactory.TRULE_XQUERY, trule))) + .thenReturn(Collections.singletonList(trValue)); } private XSLTTransformationFunction loadTransformationRule(final String path) throws Exception { final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path)); final LongAccumulator la = new LongAccumulator(); - return new XSLTTransformationFunction(new AggregationCounter(la,la,la),trValue, 0,vocabularies); + return new XSLTTransformationFunction(new AggregationCounter(la, la, la), trValue, 0, vocabularies); } private List vocs() throws IOException { return IOUtils - .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt")); + .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt")); } private List synonyms() throws IOException { return IOUtils - .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt")); + .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt")); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 42ce7f90b..ac483f10b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -59,17 +59,19 @@ public class CleaningFunctions { } } if (Objects.nonNull(r.getAuthor())) { - r.getAuthor() - .stream() - .filter(Objects::nonNull) - .forEach(a -> { - if (Objects.nonNull(a.getPid())) { - a.getPid() - .stream() - .filter(Objects::nonNull) - .forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES)); - } - }); + r + .getAuthor() + .stream() + .filter(Objects::nonNull) + .forEach(a -> { + if (Objects.nonNull(a.getPid())) { + a + .getPid() + .stream() + .filter(Objects::nonNull) + .forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES)); + } + }); } if (value instanceof Publication) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index db1a2ef57..7ff06e428 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -55,9 +55,9 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.DbClient; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; import 
eu.dnetlib.dhp.oa.graph.raw.common.VerifyNsPrefixPredicate;
-import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
 import eu.dnetlib.dhp.schema.oaf.Context;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Dataset;
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java
index 8293faac4..83303ae8e 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplicationTest.java
@@ -15,8 +15,8 @@ import org.junit.jupiter.api.extension.ExtendWith;
 import org.mockito.Mock;
 import org.mockito.junit.jupiter.MockitoExtension;
-import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
index 5c8e4e4c6..e54fe28aa 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java
@@ -21,8 +21,8 @@ import org.mockito.junit.jupiter.MockitoExtension;
 import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.Dataset;

From 99cf3a8ea4980de25f7030acc398ac751ac165ea Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo
Date: Thu, 28 Jan 2021 16:34:46 +0100
Subject: [PATCH 07/86] Merged Datacite transform into this branch

---
 .../dhp/schema/scholexplorer/OafUtils.scala   |    4 +-
 dhp-workflows/dhp-aggregation/pom.xml         |   36 +-
 .../datacite/AbstractRestClient.scala         |   73 ++
 .../datacite/DataciteAPIImporter.scala        |   25 +
 .../DataciteToOAFTransformation.scala         |  475 ++++++++
 .../datacite/ExportActionSetJobNode.scala     |   41 +
 .../GenerateDataciteDatasetSpark.scala        |   48 +
 .../datacite/ImportDatacite.scala             |  168 +++
 .../actionmanager/datacite/datacite_filter   |   28 +
 .../datacite/exportDataset_parameters.json    |   21 +
 .../datacite/generate_dataset_params.json     |   33 +
 .../actionmanager/datacite/hostedBy_map.json  | 1032 +++++++++++++++++
 .../datacite/import_from_api.json             |   27 +
 .../datacite/oozie_app/config-default.xml     |   18 +
 .../datacite/oozie_app/workflow.xml           |  103 ++
 .../doiboost/DoiBoostMappingUtil.scala        |    4 +-
 .../java/eu/dnetlib/dhp/export/DLIToOAF.scala |    2 +-
 17 files changed, 2132 insertions(+), 6 deletions(-)
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala
 create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala
 create 
mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala index 27eec77fa..526d65782 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala @@ -15,11 +15,11 @@ object OafUtils { } - def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = { + def generateDataInfo(trust: String = "0.9", invisible: Boolean = false): DataInfo = { val di = new DataInfo di.setDeletedbyinference(false) di.setInferred(false) - di.setInvisible(false) + di.setInvisible(invisible) di.setTrust(trust) di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions")) di diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index cf0fa0efe..0445e0e1b 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -7,10 +7,44 @@ 1.2.4-SNAPSHOT dhp-aggregation - + + + + net.alchim31.maven + scala-maven-plugin + ${net.alchim31.maven.version} + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + + + + org.apache.httpcomponents + httpclient + + org.apache.spark spark-core_2.11 diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala new file mode 100644 index 000000000..852147ccd --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala @@ -0,0 +1,73 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import org.apache.commons.io.IOUtils +import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest} +import org.apache.http.entity.StringEntity +import org.apache.http.impl.client.HttpClients + +import java.io.IOException + +abstract class AbstractRestClient extends Iterator[String]{ + + var buffer: List[String] = List() + var 
current_index:Int = 0 + + var scroll_value: Option[String] = None + + var complete:Boolean = false + + + def extractInfo(input: String): Unit + + protected def getBufferData(): Unit + + + def doHTTPGETRequest(url:String): String = { + val httpGet = new HttpGet(url) + doHTTPRequest(httpGet) + + } + + def doHTTPPOSTRequest(url:String, json:String): String = { + val httpPost = new HttpPost(url) + if (json != null) { + val entity = new StringEntity(json) + httpPost.setEntity(entity) + httpPost.setHeader("Accept", "application/json") + httpPost.setHeader("Content-type", "application/json") + } + doHTTPRequest(httpPost) + } + + def hasNext: Boolean = { + buffer.nonEmpty && current_index < buffer.size + } + + + override def next(): String = { + val next_item:String = buffer(current_index) + current_index = current_index + 1 + if (current_index == buffer.size) + getBufferData() + next_item + } + + + private def doHTTPRequest[A <: HttpUriRequest](r: A) :String ={ + val client = HttpClients.createDefault + try { + val response = client.execute(r) + IOUtils.toString(response.getEntity.getContent) + } catch { + case e: Throwable => + throw new RuntimeException("Error on executing request ", e) + } finally try client.close() + catch { + case e: IOException => + throw new RuntimeException("Unable to close client ", e) + } + } + + getBufferData() + +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala new file mode 100644 index 000000000..c2ad6855c --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala @@ -0,0 +1,25 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import org.json4s.{DefaultFormats, JValue} +import org.json4s.jackson.JsonMethods.{compact, parse, render} + +class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10) extends AbstractRestClient { + + override def extractInfo(input: String): Unit = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(input) + buffer = (json \ "data").extract[List[JValue]].map(s => compact(render(s))) + val next_url = (json \ "links" \ "next").extractOrElse[String](null) + scroll_value = if (next_url != null && next_url.nonEmpty) Some(next_url) else None + if (scroll_value.isEmpty) + complete = true + current_index = 0 + } + + override def getBufferData(): Unit = { + if (!complete) { + val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20*]") + extractInfo(response) + } + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala new file mode 100644 index 000000000..9418e71da --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala @@ -0,0 +1,475 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup +import eu.dnetlib.dhp.schema.action.AtomicAction +import 
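DataciteAPIImporter above is a plain Iterator[String] over Datacite's cursor-based paging: each response's links.next URL seeds the following request, and the buffer is refilled whenever a page is exhausted. A minimal consumer, as a sketch assuming live access to api.datacite.org (not part of this patch):

// fetch records updated since epoch 0, 100 per page
val client = new DataciteAPIImporter(timestamp = 0, blocks = 100)
client.take(3).foreach { json =>
  println(json.take(120)) // print a short preview of each raw DOI record
}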
eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset} +import eu.dnetlib.dhp.utils.DHPUtils +import org.apache.commons.lang3.StringUtils +import org.json4s.DefaultFormats +import org.json4s.JsonAST.{JField, JObject, JString} +import org.json4s.jackson.JsonMethods.parse + +import java.nio.charset.CodingErrorAction +import java.time.LocalDate +import java.time.format.DateTimeFormatter +import java.util.Locale +import java.util.regex.Pattern +import scala.collection.JavaConverters._ +import scala.io.{Codec, Source} + + + +case class DataciteType(doi:String,timestamp:Long,isActive:Boolean, json:String ){} + +case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {} + +case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {} + +case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {} + +case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {} + +case class DescriptionType(descriptionType: Option[String], description: Option[String]) {} + +case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {} + +case class DateType(date: Option[String], dateType: Option[String]) {} + +case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {} + +object DataciteToOAFTransformation { + + implicit val codec: Codec = Codec("UTF-8") + codec.onMalformedInput(CodingErrorAction.REPLACE) + codec.onUnmappableCharacter(CodingErrorAction.REPLACE) + + private val PID_VOCABULARY = "dnet:pid_types" + val COBJ_VOCABULARY = "dnet:publication_resource" + val RESULT_VOCABULARY = "dnet:result_typologies" + val ACCESS_MODE_VOCABULARY = "dnet:access_modes" + val DOI_CLASS = "doi" + + val TITLE_SCHEME = "dnet:dataCite_title" + val SUBJ_CLASS = "keywords" + val SUBJ_SCHEME = "dnet:subject_classification_typologies" + + val j_filter:List[String] = { + val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString + s.lines.toList + } + + val mapper = new ObjectMapper() + val unknown_repository: HostedByMapType = HostedByMapType("openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18", "Unknown Repository", "Unknown Repository", Some(1.0F)) + + val dataInfo: DataInfo = generateDataInfo("0.9") + val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue("openaire____::datacite", "Datacite") + + val hostedByMap: Map[String, HostedByMapType] = { + val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(s) + json.extract[Map[String, HostedByMapType]] + } + + val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH) + val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN) + + val 
funder_regex:List[(Pattern, String)] = List( + (Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE),"40|corda__h2020::"), + (Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE),"40|corda_______::") + + ) + + val Date_regex: List[Pattern] = List( + //Y-M-D + Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE), + //M-D-Y + Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE), + //D-M-Y + Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE), + //Y + Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE) + ) + + + def filter_json(json:String):Boolean = { + j_filter.exists(f => json.contains(f)) + } + + def toActionSet(item:Oaf) :(String, String) = { + val mapper = new ObjectMapper() + + item match { + case dataset: OafDataset => + val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset] + a.setClazz(classOf[OafDataset]) + a.setPayload(dataset) + (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case publication: Publication => + val a: AtomicAction[Publication] = new AtomicAction[Publication] + a.setClazz(classOf[Publication]) + a.setPayload(publication) + (publication.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case software: Software => + val a: AtomicAction[Software] = new AtomicAction[Software] + a.setClazz(classOf[Software]) + a.setPayload(software) + (software.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case orp: OtherResearchProduct => + val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct] + a.setClazz(classOf[OtherResearchProduct]) + a.setPayload(orp) + (orp.getClass.getCanonicalName, mapper.writeValueAsString(a)) + + case relation: Relation => + val a: AtomicAction[Relation] = new AtomicAction[Relation] + a.setClazz(classOf[Relation]) + a.setPayload(relation) + (relation.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case _ => + null + } + + } + + + + + def embargo_end(embargo_end_date: String): Boolean = { + val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]")) + val td = LocalDate.now() + td.isAfter(dt) + } + + + def extract_date(input: String): Option[String] = { + val d = Date_regex.map(pattern => { + val matcher = pattern.matcher(input) + if (matcher.find()) + matcher.group(0) + else + null + } + ).find(s => s != null) + + if (d.isDefined) { + val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get + try { + return Some(LocalDate.parse(a_date, df_en).toString) + } catch { + case _: Throwable => try { + return Some(LocalDate.parse(a_date, df_it).toString) + } catch { + case _: Throwable => try { + return None + } + } + } + } + d + } + + def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies:VocabularyGroup): (Qualifier, Qualifier) = { + if (resourceType != null && 
resourceType.nonEmpty) { + val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, resourceType) + if (typeQualifier != null) + return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid)) + } + if (schemaOrg != null && schemaOrg.nonEmpty) { + val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, schemaOrg) + if (typeQualifier != null) + return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid)) + + } + if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) { + val typeQualifier = vocabularies.getSynonymAsQualifier(COBJ_VOCABULARY, resourceTypeGeneral) + if (typeQualifier != null) + return (typeQualifier, vocabularies.getSynonymAsQualifier(RESULT_VOCABULARY, typeQualifier.getClassid)) + + } + null + } + + + def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies:VocabularyGroup): Result = { + val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies) + if (typeQualifiers == null) + return null + val i = new Instance + i.setInstancetype(typeQualifiers._1) + typeQualifiers._2.getClassname match { + case "dataset" => + val r = new OafDataset + r.setInstance(List(i).asJava) + return r + case "publication" => + val r = new Publication + r.setInstance(List(i).asJava) + return r + case "software" => + val r = new Software + r.setInstance(List(i).asJava) + return r + case "other" => + val r = new OtherResearchProduct + r.setInstance(List(i).asJava) + return r + } + null + } + + + def available_date(input: String): Boolean = { + + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(input) + val l: List[String] = for { + JObject(dates) <- json \\ "dates" + JField("dateType", JString(dateTypes)) <- dates + } yield dateTypes + + l.exists(p => p.equalsIgnoreCase("available")) + + } + + + def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = { + OafMapperUtils.structuredProperty(dt, q, null) + } + + def generateRelation(sourceId:String, targetId:String, relClass:String, cf:KeyValue, di:DataInfo) :Relation = { + + val r = new Relation + r.setSource(sourceId) + r.setTarget(targetId) + r.setRelType("resultProject") + r.setRelClass(relClass) + r.setSubRelType("outcome") + r.setCollectedfrom(List(cf).asJava) + r.setDataInfo(di) + r + + + } + + def get_projectRelation(awardUri:String, sourceId:String):List[Relation] = { + val match_pattern = funder_regex.find(s =>s._1.matcher(awardUri).find()) + + if (match_pattern.isDefined) { + val m =match_pattern.get._1 + val p = match_pattern.get._2 + val grantId = m.matcher(awardUri).replaceAll("$2") + val targetId = s"$p${DHPUtils.md5(grantId)}" + List( + generateRelation(sourceId, targetId,"isProducedBy", DATACITE_COLLECTED_FROM, dataInfo), + generateRelation(targetId, sourceId,"produces", DATACITE_COLLECTED_FROM, dataInfo) + ) + } + else + List() + + } + + + def generateOAF(input:String,ts:Long, dateOfCollection:Long, vocabularies: VocabularyGroup):List[Oaf] = { + if (filter_json(input)) + return List() + + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json = parse(input) + + val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null) + val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null) + val schemaOrg = (json \ "attributes" 
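get_projectRelation above derives project links purely from the award URI: the first matching pattern in funder_regex supplies the funder namespace prefix, and the captured grant number is md5-hashed into the project identifier. A worked example with a hypothetical H2020 award URI (the URI is illustrative, the derivation follows the code above):

val awardUri = "info:eu-repo/grantagreement/ec/h2020/675121/" // hypothetical
// matches the h2020 pattern paired with the prefix "40|corda__h2020::"
// grantId  = "675121"                    (replaceAll("$2") keeps capture group 2)
// targetId = s"40|corda__h2020::${DHPUtils.md5("675121")}"
// yielding an isProducedBy relation (result -> project) plus its inverse produces relation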
\ "types" \ "schemaOrg").extractOrElse[String](null) + + val doi = (json \ "attributes" \ "doi").extract[String] + if (doi.isEmpty) + return List() + + //Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies + val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies) + if (result == null) + return List() + + + val doi_q = vocabularies.getSynonymAsQualifier(PID_VOCABULARY, "doi") + val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo) + result.setPid(List(pid).asJava) + result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true)) + result.setOriginalId(List(doi).asJava) + result.setDateofcollection(s"${dateOfCollection}") + result.setDateoftransformation(s"$ts") + result.setDataInfo(dataInfo) + + val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List()) + + + val authors = creators.zipWithIndex.map { case (c, idx) => + val a = new Author + a.setFullname(c.name.orNull) + a.setName(c.givenName.orNull) + a.setSurname(c.familyName.orNull) + if (c.nameIdentifiers!= null&& c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) { + a.setPid(c.nameIdentifiers.get.map(ni => { + val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(PID_VOCABULARY, ni.nameIdentifierScheme.get.toLowerCase()) else null + if (ni.nameIdentifier!= null && ni.nameIdentifier.isDefined) { + OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo) + } + else + null + + } + ) + .asJava) + } + if (c.affiliation.isDefined) + a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava) + a.setRank(idx + 1) + a + } + + + + + val titles:List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List()) + + result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => { + if (t.titleType.isEmpty) { + OafMapperUtils.structuredProperty(t.title.get, "main title", "main title", TITLE_SCHEME, TITLE_SCHEME, null) + } else { + OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, TITLE_SCHEME, TITLE_SCHEME, null) + } + }).asJava) + + if(authors==null || authors.isEmpty || !authors.exists(a => a !=null)) + return List() + result.setAuthor(authors.asJava) + + val dates = (json \\ "dates").extract[List[DateType]] + val publication_year = (json \\ "publicationYear").extractOrElse[String](null) + + val i_date = dates + .filter(d => d.date.isDefined && d.dateType.isDefined) + .find(d => d.dateType.get.equalsIgnoreCase("issued")) + .map(d => extract_date(d.date.get)) + val a_date: Option[String] = dates + .filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available")) + .map(d => extract_date(d.date.get)) + .find(d => d != null && d.isDefined) + .map(d => d.get) + + if (a_date.isDefined) { + result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null)) + } + if (i_date.isDefined && i_date.get.isDefined) { + result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) + result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) + } + else if (publication_year != null) { + result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) + result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) + } + + + result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined) + .map(d => (extract_date(d.date.get), d.dateType.get)) + .filter(d => 
d._1.isDefined) + .map(d => (d._1.get, vocabularies.getTermAsQualifier("dnet:dataCite_date", d._2.toLowerCase()))) + .filter(d => d._2 != null) + .map(d => generateOAFDate(d._1, d._2)).asJava) + + val subjects = (json \\ "subjects").extract[List[SubjectType]] + + result.setSubject(subjects.filter(s => s.subject.nonEmpty) + .map(s => + OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, SUBJ_SCHEME, SUBJ_SCHEME, null) + ).asJava) + + + result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) + + val descriptions = (json \\ "descriptions").extract[List[DescriptionType]] + + result.setDescription( + descriptions + .filter(d => d.description.isDefined). + map(d => + OafMapperUtils.field(d.description.get, null) + ).filter(s => s!=null).asJava) + + + val publisher = (json \\ "publisher").extractOrElse[String](null) + if (publisher != null) + result.setPublisher(OafMapperUtils.field(publisher, null)) + + + val language: String = (json \\ "language").extractOrElse[String](null) + + if (language != null) + result.setLanguage(vocabularies.getSynonymAsQualifier("dnet:languages", language)) + + + val instance = result.getInstance().get(0) + + val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String] + + val accessRights:List[String] = for { + JObject(rightsList) <- json \\ "rightsList" + JField("rightsUri", JString(rightsUri)) <- rightsList + } yield rightsUri + + val aRights: Option[Qualifier] = accessRights.map(r => { + vocabularies.getSynonymAsQualifier(ACCESS_MODE_VOCABULARY, r) + }).find(q => q != null) + + + val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.qualifier("UNKNOWN", "not available", ACCESS_MODE_VOCABULARY, ACCESS_MODE_VOCABULARY) + + if (client.isDefined) { + val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository) + instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name)) + instance.setCollectedfrom(DATACITE_COLLECTED_FROM) + instance.setUrl(List(s"https://dx.doi.org/$doi").asJava) + instance.setAccessright(access_rights_qualifier) + + //'http') and matches(., '.*(/licenses|/publicdomain|unlicense.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*')]"> + val license = accessRights + .find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*")) + if (license.isDefined) + instance.setLicense(OafMapperUtils.field(license.get, null)) + } + + + val awardUris:List[String] = for { + JObject(fundingReferences) <- json \\ "fundingReferences" + JField("awardUri", JString(awardUri)) <- fundingReferences + } yield awardUri + + val relations:List[Relation] =awardUris.flatMap(a=> get_projectRelation(a, result.getId)).filter(r => r!= null) + + if (relations!= null && relations.nonEmpty) { + List(result):::relations + } + else + List(result) + } + + def generateDataInfo(trust: String): DataInfo = { + val di = new DataInfo + di.setDeletedbyinference(false) + di.setInferred(false) + di.setInvisible(false) + di.setTrust(trust) + di.setProvenanceaction(OafMapperUtils.qualifier("sysimport:actionset", "sysimport:actionset", "dnet:provenanceActions", "dnet:provenanceActions")) + di + } + + def generateDSId(input: String): String = { + val b = StringUtils.substringBefore(input, "::") + val a = StringUtils.substringAfter(input, "::") + s"10|$b::${DHPUtils.md5(a)}" + } + + +} \ No newline at end of file diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala new file mode 100644 index 000000000..9f0d25735 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala @@ -0,0 +1,41 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.schema.oaf.Oaf +import org.apache.hadoop.io.Text +import org.apache.hadoop.io.compress.GzipCodec +import org.apache.hadoop.mapred.SequenceFileOutputFormat +import org.apache.spark.SparkConf +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +import scala.io.Source + +object ExportActionSetJobNode { + + val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass) + + def main(args: Array[String]): Unit = { + val conf = new SparkConf + val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString) + parser.parseArgument(args) + val master = parser.get("master") + val sourcePath = parser.get("sourcePath") + val targetPath = parser.get("targetPath") + + val spark: SparkSession = SparkSession.builder().config(conf) + .appName(ExportActionSetJobNode.getClass.getSimpleName) + .master(master) + .getOrCreate() + implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING) + + spark.read.load(sourcePath).as[Oaf] + .map(o =>DataciteToOAFTransformation.toActionSet(o)) + .filter(o => o!= null) + .rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) + + + } + +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala new file mode 100644 index 000000000..6837e94b2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala @@ -0,0 +1,48 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup +import eu.dnetlib.dhp.model.mdstore.MetadataRecord +import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.utils.ISLookupClientFactory +import org.apache.spark.SparkConf +import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +import scala.io.Source + +object GenerateDataciteDatasetSpark { + + val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass) + + def main(args: Array[String]): Unit = { + val conf = new SparkConf + val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString) + parser.parseArgument(args) + val master = parser.get("master") + val sourcePath = parser.get("sourcePath") + val targetPath = parser.get("targetPath") + val isLookupUrl: 
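ExportActionSetJobNode above stores each Oaf as a (payload class name, AtomicAction JSON) Text pair in a gzip-compressed SequenceFile. A quick way to inspect such an export, counting actions per payload class, as a sketch assuming an active SparkContext sc and an illustrative path:

import org.apache.hadoop.io.Text

val pairs = sc.sequenceFile("/tmp/datacite_actionset", classOf[Text], classOf[Text])
pairs
  .map { case (k, _) => (k.toString, 1L) } // copy the key out of the reused Text instance
  .reduceByKey(_ + _)
  .collect()
  .foreach { case (clazz, n) => println(s"$clazz -> $n") }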
String = parser.get("isLookupUrl") + log.info("isLookupUrl: {}", isLookupUrl) + + val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) + val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) + + val spark: SparkSession = SparkSession.builder().config(conf) + .appName(GenerateDataciteDatasetSpark.getClass.getSimpleName) + .master(master) + .getOrCreate() + + implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord] + + implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + + import spark.implicits._ + + spark.read.load(sourcePath).as[DataciteType] + .filter(d => d.isActive) + .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies)) + .filter(d => d != null) + .write.mode(SaveMode.Overwrite).save(targetPath) + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala new file mode 100644 index 000000000..06fcbb518 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala @@ -0,0 +1,168 @@ +package eu.dnetlib.dhp.actionmanager.datacite + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} +import org.apache.hadoop.hdfs.DistributedFileSystem +import org.apache.hadoop.io.{IntWritable, SequenceFile, Text} +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.expressions.Aggregator +import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession} +import org.json4s.DefaultFormats +import org.json4s.jackson.JsonMethods.parse +import org.apache.spark.sql.functions.max +import org.slf4j.{Logger, LoggerFactory} + +import java.time.format.DateTimeFormatter._ +import java.time.{LocalDateTime, ZoneOffset} +import scala.io.Source + +object ImportDatacite { + + val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass) + + + def convertAPIStringToDataciteItem(input:String): DataciteType = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(input) + val doi = (json \ "attributes" \ "doi").extract[String].toLowerCase + + val isActive = (json \ "attributes" \ "isActive").extract[Boolean] + + val timestamp_string = (json \ "attributes" \ "updated").extract[String] + val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME) + DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli/1000, isActive = isActive, json = input) + + } + + + + def main(args: Array[String]): Unit = { + + val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString) + parser.parseArgument(args) + val master = parser.get("master") + + val hdfsuri = parser.get("namenode") + log.info(s"namenode is $hdfsuri") + + val targetPath = parser.get("targetPath") + log.info(s"targetPath is $targetPath") + + val dataciteDump = parser.get("dataciteDumpPath") + log.info(s"dataciteDump is $dataciteDump") + + val hdfsTargetPath =new Path(targetPath) + log.info(s"hdfsTargetPath is $hdfsTargetPath") + + val spark: SparkSession = SparkSession.builder() + .appName(ImportDatacite.getClass.getSimpleName) + .master(master) + 
.getOrCreate() + + // ====== Init HDFS File System Object + val conf = new Configuration + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri) + + // Because of Maven + conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName) + conf.set("fs.file.impl", classOf[LocalFileSystem].getName) + val sc:SparkContext = spark.sparkContext + sc.setLogLevel("ERROR") + + import spark.implicits._ + + + val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable { + + override def zero: DataciteType = null + + override def reduce(a: DataciteType, b: DataciteType): DataciteType = { + if (b == null) + return a + if (a == null) + return b + if(a.timestamp >b.timestamp) { + return a + } + b + } + + override def merge(a: DataciteType, b: DataciteType): DataciteType = { + reduce(a,b) + } + + override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]] + + override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]] + + override def finish(reduction: DataciteType): DataciteType = reduction + } + + val dump:Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType] + val ts = dump.select(max("timestamp")).first().getLong(0) + + log.info(s"last Timestamp is $ts") + + val cnt = writeSequenceFile(hdfsTargetPath, ts, conf) + + log.info(s"Imported from Datacite API $cnt documents") + + if (cnt > 0) { + + val inputRdd:RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text]) + .map(s => s._2.toString) + .map(s => convertAPIStringToDataciteItem(s)) + spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset") + + val ds:Dataset[DataciteType] = spark.read.load(s"${targetPath}_dataset").as[DataciteType] + + dump + .union(ds) + .groupByKey(_.doi) + .agg(dataciteAggregator.toColumn) + .map(s=>s._2) + .repartition(4000) + .write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated") + + val fs = FileSystem.get(sc.hadoopConfiguration) + fs.delete(new Path(s"$dataciteDump"), true) + fs.rename(new Path(s"${dataciteDump}_updated"),new Path(s"$dataciteDump")) + } + } + + private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration):Long = { + val client = new DataciteAPIImporter(timestamp*1000, 1000) + var i = 0 + try { + val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text])) + try { + + var start: Long = System.currentTimeMillis + var end: Long = 0 + val key: IntWritable = new IntWritable(i) + val value: Text = new Text + while ( { + client.hasNext + }) { + key.set({ + i += 1; + i - 1 + }) + value.set(client.next()) + writer.append(key, value) + writer.hflush() + if (i % 1000 == 0) { + end = System.currentTimeMillis + val time = (end - start) / 1000.0F + println(s"Imported $i in $time seconds") + start = System.currentTimeMillis + } + } + } finally if (writer != null) writer.close() + } + i + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter new file mode 100644 index 000000000..ad80d6998 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter @@ -0,0 +1,28 @@ +TUBYDI - Assistir Filmes e 
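The Aggregator wired into ImportDatacite above implements last-write-wins per DOI: reduce keeps whichever operand has the larger timestamp, so after union(dump, updates).groupByKey(_.doi) exactly one DataciteType survives per DOI. The same rule as a plain function, exercised on a hypothetical DOI (a sketch, not part of the patch):

def freshest(a: DataciteType, b: DataciteType): DataciteType =
  if (a == null) b
  else if (b == null) a
  else if (a.timestamp > b.timestamp) a
  else b

// the record with the later update timestamp (in epoch seconds) wins
val kept = freshest(
  DataciteType("10.1234/example", timestamp = 10, isActive = true, json = "{}"),
  DataciteType("10.1234/example", timestamp = 52, isActive = true, json = "{}"))
assert(kept.timestamp == 52)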
Series Online Grátis +123Movies +WATCH FULL MOVIE +Movierulz +Full Movie Online +MOVIé WatcH +The King of Staten Island 2020 Online For Free +Watch Train to Busan 2 2020 online for free +Sixth Sense Movie Novelization +Film Complet streaming vf gratuit en ligne +watch now free +LIVE stream watch +LIVE stream UFC +RBC Heritage live stream +MLBStreams Free +NFL Live Stream +Live Stream Free +Royal Ascot 2020 Live Stream +TV Shows Full Episodes Official +FuboTV +Gomovies +Online Free Trial Access +123watch +DÜŞÜK HAPI +Bebek Düşürme Yöntemleri +WHATSAP İLETİŞİM +Cytotec +düşük hapı \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json new file mode 100644 index 000000000..63e080337 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json @@ -0,0 +1,21 @@ +[ + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the source mdstore path", + "paramRequired": true + }, + + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "the target mdstore path", + "paramRequired": true + }, + { + "paramName": "m", + "paramLongName": "master", + "paramDescription": "the master name", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json new file mode 100644 index 000000000..34fa3ed99 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json @@ -0,0 +1,33 @@ +[ + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the source mdstore path", + "paramRequired": true + }, + + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "the target mdstore path", + "paramRequired": true + }, + { + "paramName": "tr", + "paramLongName": "transformationRule", + "paramDescription": "the transformation Rule", + "paramRequired": true + }, + { + "paramName": "m", + "paramLongName": "master", + "paramDescription": "the master name", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "isLookupUrl", + "paramDescription": "the isLookup URL", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json new file mode 100644 index 000000000..d014dab5a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json @@ -0,0 +1,1032 @@ +{ + "SND.QOG": { + "openaire_id": "re3data_____::r3d100012231", + "datacite_name": "Quality of Government Institute", + "official_name": "Quality of Government Institute's Data", + "similarity": 0.8985507246376812 + }, + "GESIS.CESSDA": { + "openaire_id": "re3data_____::r3d100010202", + "datacite_name": "CESSDA ERIC", + "official_name": "CESSDA ERIC" + }, + "BL.CRAN": { + "openaire_id": "re3data_____::r3d100012068", + "datacite_name": "Cranfield University", + "official_name": "Cranfield 
Online Research Data" + }, + "SUL.OPENNEURO": { + "openaire_id": "re3data_____::r3d100010924", + "datacite_name": "OpenNeuro", + "official_name": "OpenNeuro" + }, + "UNAVCO.UNAVCO": { + "openaire_id": "re3data_____::r3d100010872", + "datacite_name": "UNAVCO", + "official_name": "UNAVCO" + }, + "SUL.SDR": { + "openaire_id": "re3data_____::r3d100010710", + "datacite_name": "Stanford Digital Repository", + "official_name": "Stanford Digital Repository" + }, + "DK.ICES": { + "openaire_id": "re3data_____::r3d100011288", + "datacite_name": "International Council for the Exploration of the Sea (ICES)", + "official_name": "International Council for the Exploration of the Sea datasets", + "similarity": 0.8833333333333333 + }, + "CISTI.DFOSCIMR": { + "openaire_id": "re3data_____::r3d100012039", + "datacite_name": "Bedford Institute of Oceanography - Fisheries and Oceans Canada - Ocean Data and Information Section", + "official_name": "Bedford Institute of Oceanography - Oceanographic Databases" + }, + "CSIC.DIGITAL": { + "openaire_id": "re3data_____::r3d100011076", + "datacite_name": "Digital CSIC", + "official_name": "DIGITAL.CSIC" + }, + "TIB.PANGAEA": { + "openaire_id": "re3data_____::r3d100010134", + "datacite_name": "PANGAEA", + "official_name": "PANGAEA" + }, + "PSU.DATACOM": { + "openaire_id": "re3data_____::r3d100010477", + "datacite_name": "Data Commons", + "official_name": "ANU Data Commons", + "similarity": 0.8571428571428571 + }, + "ANDS.CENTRE72": { + "openaire_id": "re3data_____::r3d100010451", + "datacite_name": "PARADISEC", + "official_name": "Pacific and Regional Archive for Digital Sources in Endangered Cultures" + }, + "BL.OXDB": { + "openaire_id": "re3data_____::r3d100011653", + "datacite_name": "Oxford University Library Service Databank", + "official_name": "DataBank, Bodleian Libraries, University of Oxford" + }, + "BL.STANDREW": { + "openaire_id": "re3data_____::r3d100012411", + "datacite_name": "University of St Andrews", + "official_name": "St Andrews Research portal - Research Data" + }, + "TIB.BAFG": { + "openaire_id": "re3data_____::r3d100011664", + "datacite_name": "Bundesanstalt f\u00fcr Gew\u00e4sserkunde", + "official_name": "Geoportal der BFG" + }, + "CRUI.UNIBO": { + "openaire_id": "re3data_____::r3d100012604", + "datacite_name": "Universit\u00e0 degli Studi di Bologna", + "official_name": "AMS Acta" + }, + "GDCC.ODUM-LIBRARY": { + "openaire_id": "re3data_____::r3d100000005", + "datacite_name": "UNC Libraries", + "official_name": "UNC Dataverse" + }, + "RG.RG": { + "openaire_id": "re3data_____::r3d100012227", + "datacite_name": "ResearchGate", + "official_name": "ResearchGate" + }, + "TIB.EUMETSAT": { + "openaire_id": "re3data_____::r3d100010232", + "datacite_name": "EUMETSAT", + "official_name": "Eumetsat" + }, + "SND.SMHI": { + "openaire_id": "re3data_____::r3d100011776", + "datacite_name": "Swedish Meteorological and Hydrological Institute open data", + "official_name": "Swedish Meteorological and Hydrological Institute open data" + }, + "NOAA.NCEI": { + "openaire_id": "re3data_____::r3d100011801", + "datacite_name": "National Oceanic and Atmospheric Administration (NOAA) National Centers for Environmental Information (NCEI)", + "official_name": "NCEI" + }, + "TIB.WDCC": { + "openaire_id": "re3data_____::r3d100010299", + "datacite_name": "World Data Center for Climate", + "official_name": "World Data Center for Climate" + }, + "CNGB.GIGADB": { + "openaire_id": "re3data_____::r3d100010478", + "datacite_name": "GigaDB", + "official_name": "GigaDB" + }, + 
"DELFT.VLIZ": { + "openaire_id": "re3data_____::r3d100010661", + "datacite_name": "Vlaams Instituut voor de Zee", + "official_name": "Flanders Marine Institute" + }, + "NUS.SB": { + "openaire_id": "re3data_____::r3d100012564", + "datacite_name": "National University of Singapore", + "official_name": "ScholarBank@NUS" + }, + "EDI.EDI": { + "openaire_id": "re3data_____::r3d100010272", + "datacite_name": "Environmental Data Initiative", + "official_name": "Environmental Data Initiative Repository" + }, + "INIST.ADISP": { + "openaire_id": "re3data_____::r3d100010494", + "datacite_name": "Quetelet PROGEDO Diffusion", + "official_name": "Quetelet PROGEDO Diffusion" + }, + "GESIS.SHARE": { + "openaire_id": "re3data_____::r3d100010430", + "datacite_name": "SHARE - ERIC", + "official_name": "Survey of Health, Ageing and Retirement in Europe" + }, + "ANDS.CENTRE-1": { + "openaire_id": "re3data_____::r3d100010864", + "datacite_name": "Griffith University", + "official_name": "Griffith University Research Data Repository" + }, + "BL.READING": { + "openaire_id": "re3data_____::r3d100012064", + "datacite_name": "University of Reading", + "official_name": "University of Reading Research Data Archive" + }, + "CORNELL.CISER": { + "openaire_id": "re3data_____::r3d100011056", + "datacite_name": "CISER Data Archive", + "official_name": "CISER Data Archive" + }, + "DRYAD.DRYAD": { + "openaire_id": "re3data_____::r3d100000044", + "datacite_name": "DRYAD", + "official_name": "DRYAD" + }, + "CDL.PISCO": { + "openaire_id": "re3data_____::r3d100010947", + "datacite_name": "Partnership for Interdisciplinary Studies of Coastal Oceans (PISCO)", + "official_name": "Partnership for Interdisciplinary Studies of Coastal Oceans" + }, + "IEEE.DATAPORT": { + "openaire_id": "re3data_____::r3d100012569", + "datacite_name": "IEEE DataPort", + "official_name": "IEEE DataPort" + }, + "DELFT.MAASTRO": { + "openaire_id": "re3data_____::r3d100011086", + "datacite_name": "MAASTRO Clinic", + "official_name": "CancerData.org" + }, + "USGS.PROD": { + "openaire_id": "re3data_____::r3d100010054", + "datacite_name": "USGS DOI Tool Production Environment", + "official_name": "U.S. 
Geological Survey" + }, + "GDCC.ODUM-DV": { + "openaire_id": "re3data_____::r3d100000005", + "datacite_name": "Odum Institute Dataverse", + "official_name": "UNC Dataverse" + }, + "CDL.SDSCSG": { + "openaire_id": "re3data_____::r3d100011690", + "datacite_name": "UCSD Signaling Gateway", + "official_name": "UCSD Signaling gateway" + }, + "ORBIS.NKN": { + "openaire_id": "re3data_____::r3d100011587", + "datacite_name": "Northwest Knowledge Network", + "official_name": "Northwest Knowledge Network" + }, + "ANDS.CENTRE63": { + "openaire_id": "re3data_____::r3d100010918", + "datacite_name": "Test: Atlas of Living Australia", + "official_name": "Atlas of Living Australia", + "similarity": 0.8928571428571429 + }, + "SML.TALKBANK": { + "openaire_id": "re3data_____::r3d100010887", + "datacite_name": "TalkBank", + "official_name": "TalkBank" + }, + "CORNELL.LIBRARY": { + "openaire_id": "re3data_____::r3d100012322", + "datacite_name": "Cornell University Library", + "official_name": "eCommons - Cornell's digital repository" + }, + "BL.SOTON": { + "openaire_id": "re3data_____::r3d100011245", + "datacite_name": "University of Southampton", + "official_name": "University of Southampton Institutional Research Repository" + }, + "GESIS.DB-BANK": { + "openaire_id": "re3data_____::r3d100012252", + "datacite_name": "Forschungsdaten- und Servicezentrum der Bundesbank", + "official_name": "Forschungsdaten- und Servicezentrum der Bundesbank" + }, + "ANDS.CENTRE68": { + "openaire_id": "re3data_____::r3d100010918", + "datacite_name": "Atlas of Living Australia", + "official_name": "Atlas of Living Australia" + }, + "ANDS.CENTRE69": { + "openaire_id": "re3data_____::r3d100010914", + "datacite_name": "Australian Ocean Data Network", + "official_name": "Australian Ocean Data Network Portal" + }, + "INIST.CDS": { + "openaire_id": "re3data_____::r3d100010584", + "datacite_name": "Strasbourg Astronomical Data Center", + "official_name": "Strasbourg Astronomical Data Center" + }, + "BL.NHM": { + "openaire_id": "re3data_____::r3d100011675", + "datacite_name": "Natural History Museum, London", + "official_name": "Natural History Museum, Data Portal" + }, + "BL.ADS": { + "openaire_id": "re3data_____::r3d100000006", + "datacite_name": "Archaeology Data Service", + "official_name": "Archaeology Data Service" + }, + "GDCC.JHU": { + "openaire_id": "re3data_____::r3d100011836", + "datacite_name": "Johns Hopkins University Library", + "official_name": "Johns Hopkins Data Archive Dataverse Network" + }, + "BL.ED": { + "openaire_id": "re3data_____::r3d100000047", + "datacite_name": "University of Edinburgh", + "official_name": "Edinburgh DataShare" + }, + "BL.EXETER": { + "openaire_id": "re3data_____::r3d100011202", + "datacite_name": "University of Exeter", + "official_name": "Open Research Exeter" + }, + "BL.NCL": { + "openaire_id": "re3data_____::r3d100012408", + "datacite_name": "Newcastle University", + "official_name": "NCL Data" + }, + "BROWN.BDR": { + "openaire_id": "re3data_____::r3d100011654", + "datacite_name": "Brown Digital Repository", + "official_name": "Brown Digital Repository" + }, + "GDCC.SYR-QDR": { + "openaire_id": "re3data_____::r3d100011038", + "datacite_name": "Syracuse University Qualitative Data Repository", + "official_name": "Qualitative Data Repository" + }, + "BL.BRISTOL": { + "openaire_id": "re3data_____::r3d100011099", + "datacite_name": "University of Bristol", + "official_name": "data.bris Research Data Repository" + }, + "DATACITE.DATACITE": { + "openaire_id": "openaire____::datacite", + 
"datacite_name": "DataCite", + "official_name": "Datacite" + }, + "ESTDOI.KEEL": { + "openaire_id": "re3data_____::r3d100011941", + "datacite_name": "Keeleressursid. The Center of Estonian Language Resources", + "official_name": "Center of Estonian Language Resources" + }, + "BL.ESSEX": { + "openaire_id": "re3data_____::r3d100012405", + "datacite_name": "University of Essex", + "official_name": "Research Data at Essex" + }, + "PURDUE.MDF": { + "openaire_id": "re3data_____::r3d100012080", + "datacite_name": "Univ Chicago Materials Data Facility", + "official_name": "Materials Data Facility" + }, + "DELFT.KNMI": { + "openaire_id": "re3data_____::r3d100011879", + "datacite_name": "KNMI Data Centre", + "official_name": "KNMI Data Centre" + }, + "CUL.CIESIN": { + "openaire_id": "re3data_____::r3d100010207", + "datacite_name": "Center for International Earth Science Information Network", + "official_name": "Center for International Earth Science Information Network" + }, + "WISC.NEOTOMA": { + "openaire_id": "re3data_____::r3d100011761", + "datacite_name": "Neotoma Paleoecological Database", + "official_name": "Neotoma Paleoecology Database", + "similarity": 0.9180327868852459 + }, + "IRIS.IRIS": { + "openaire_id": "re3data_____::r3d100010268", + "datacite_name": "Incorporated Research Institutions for Seismology", + "official_name": "Incorporated Research Institutions for Seismology" + }, + "ANDS.CENTRE50": { + "openaire_id": "re3data_____::r3d100012378", + "datacite_name": "Analysis and Policy Observatory", + "official_name": "Analysis and Policy Observatory" + }, + "FAO.RING": { + "openaire_id": "re3data_____::r3d100012571", + "datacite_name": "CIARD RING", + "official_name": "CIARD Ring" + }, + "CUL.R2R": { + "openaire_id": "re3data_____::r3d100010735", + "datacite_name": "Rolling Deck to Repository", + "official_name": "Rolling Deck to Repository" + }, + "DEMO.GRIIDC": { + "openaire_id": "re3data_____::r3d100011571", + "datacite_name": "Gulf of Mexico Research Initiative Information and Data Cooperative", + "official_name": "Gulf of Mexico Research Initiative Information and Data Cooperative" + }, + "ANDS.CENTRE-6": { + "openaire_id": "re3data_____::r3d100012268", + "datacite_name": "Curtin University", + "official_name": "Curtin University Research Data Collection" + }, + "ANDS.CENTRE-5": { + "openaire_id": "re3data_____::r3d100012013", + "datacite_name": "TERN Central Portal", + "official_name": "TERN Data Discovery portal" + }, + "FIGSHARE.UCT": { + "openaire_id": "re3data_____::r3d100012633", + "datacite_name": "University of Cape Town (UCT)", + "official_name": "ZivaHub" + }, + "BIBSYS.UIT-ORD": { + "openaire_id": "re3data_____::r3d100012538", + "datacite_name": "DataverseNO", + "official_name": "DataverseNO" + }, + "CISTI.CADC": { + "openaire_id": "re3data_____::r3d100000016", + "datacite_name": "Canadian Astronomy Data Centre", + "official_name": "The Canadian Astronomy Data Centre", + "similarity": 0.9375 + }, + "BL.CCDC": { + "openaire_id": "re3data_____::r3d100010197", + "datacite_name": "The Cambridge Crystallographic Data Centre", + "official_name": "The Cambridge Structural Database" + }, + "BL.UCLD": { + "openaire_id": "re3data_____::r3d100012417", + "datacite_name": "University College London", + "official_name": "UCL Discovery" + }, + "GESIS.RKI": { + "openaire_id": "re3data_____::r3d100010436", + "datacite_name": "'Health Monitoring' Research Data Centre at the Robert Koch Institute", + "official_name": "'Health Monitoring' Research Data Centre at the Robert Koch Institute" 
+ }, + "BL.DRI": { + "openaire_id": "re3data_____::r3d100011805", + "datacite_name": "Digital Repository of Ireland", + "official_name": "Digital Repository of Ireland" + }, + "TIB.KIT-IMK": { + "openaire_id": "re3data_____::r3d100011956", + "datacite_name": "Institute for Meteorology and Climate Research - Atmospheric Trace Gases and Remote Sensing", + "official_name": "CARIBIC" + }, + "DOINZ.LANDCARE": { + "openaire_id": "re3data_____::r3d100011662", + "datacite_name": "Landcare Research New Zealand Ltd", + "official_name": "Landcare Research Data Repository" + }, + "DEMO.EMORY": { + "openaire_id": "re3data_____::r3d100011559", + "datacite_name": "The Cancer Imaging Archive", + "official_name": "The Cancer Imaging Archive" + }, + "UMN.DRUM": { + "openaire_id": "re3data_____::r3d100011393", + "datacite_name": "Data Repository for the University of Minnesota", + "official_name": "Data Repository for the University of Minnesota" + }, + "CISTI.SFU": { + "openaire_id": "re3data_____::r3d100012512", + "datacite_name": "Simon Fraser University", + "official_name": "SFU Radar" + }, + "GESIS.ICPSR": { + "openaire_id": "re3data_____::r3d100010255", + "datacite_name": "ICPSR", + "official_name": "Inter-university Consortium for Political and Social Research" + }, + "ANDS.CENTRE49": { + "openaire_id": "re3data_____::r3d100012145", + "datacite_name": "The University of Melbourne", + "official_name": "melbourne.figshare.com" + }, + "ZBW.IFO": { + "openaire_id": "re3data_____::r3d100010201", + "datacite_name": "LMU-ifo Economics & Business Data Center", + "official_name": "LMU-ifo Economics & Business Data Center" + }, + "TIB.BEILST": { + "openaire_id": "re3data_____::r3d100012329", + "datacite_name": "Beilstein-Institut zur F\u00f6rderung der Chemischen Wissenschaften", + "official_name": "STRENDA DB" + }, + "ZBW.ZBW-JDA": { + "openaire_id": "re3data_____::r3d100012190", + "datacite_name": "ZBW Journal Data Archive", + "official_name": "ZBW Journal Data Archive" + }, + "BL.UKDA": { + "openaire_id": "re3data_____::r3d100010215", + "datacite_name": "UK Data Archive", + "official_name": "UK Data Archive" + }, + "CERN.INSPIRE": { + "openaire_id": "re3data_____::r3d100011077", + "datacite_name": "inspirehep.net", + "official_name": "Inspire-HEP" + }, + "CISTI.OTNDC": { + "openaire_id": "re3data_____::r3d100012083", + "datacite_name": "Ocean Tracking Network", + "official_name": "Ocean Tracking Network" + }, + "CISTI.CC": { + "openaire_id": "re3data_____::r3d100012646", + "datacite_name": "Compute Canada", + "official_name": "Federated Research Data Repository" + }, + "SND.ICOS": { + "openaire_id": "re3data_____::r3d100012203", + "datacite_name": "ICOS Carbon Portal", + "official_name": "ICOS Carbon Portal" + }, + "BL.MENDELEY": { + "openaire_id": "re3data_____::r3d100011868", + "datacite_name": "Mendeley", + "official_name": "Mendeley Data" + }, + "DELFT.UU": { + "openaire_id": "re3data_____::r3d100011201", + "datacite_name": "Universiteit Utrecht", + "official_name": "DataverseNL" + }, + "GESIS.DSZ-BO": { + "openaire_id": "re3data_____::r3d100010439", + "datacite_name": "Data Service Center for Business and Organizational Data", + "official_name": "Data Service Center for Business and Organizational Data" + }, + "TIB.IPK": { + "openaire_id": "re3data_____::r3d100011647", + "datacite_name": "IPK Gatersleben", + "official_name": "IPK Gatersleben" + }, + "GDCC.HARVARD-DV": { + "openaire_id": "re3data_____::r3d100010051", + "datacite_name": "Harvard IQSS Dataverse", + "official_name": "Harvard Dataverse" + }, 
+ "BL.LEEDS": { + "openaire_id": "re3data_____::r3d100011945", + "datacite_name": "University of Leeds", + "official_name": "Research Data Leeds Repository" + }, + "BL.BRUNEL": { + "openaire_id": "re3data_____::r3d100012140", + "datacite_name": "Brunel University London", + "official_name": "Brunel figshare" + }, + "DEMO.ENVIDAT": { + "openaire_id": "re3data_____::r3d100012587", + "datacite_name": "EnviDat", + "official_name": "EnviDat" + }, + "GDCC.NTU": { + "openaire_id": "re3data_____::r3d100012440", + "datacite_name": "Nanyang Technological University", + "official_name": "DR-NTU (Data)" + }, + "UNM.DATAONE": { + "openaire_id": "re3data_____::r3d100000045", + "datacite_name": "DataONE", + "official_name": "DataONE" + }, + "CSC.NRD": { + "openaire_id": "re3data_____::r3d100012157", + "datacite_name": "Ministry of Culture and Education", + "official_name": "IDA Research Data Storage Service" + }, + "GESIS.DIPF": { + "openaire_id": "re3data_____::r3d100010390", + "datacite_name": "Research Data Centre for Education", + "official_name": "Research Data Centre for Education" + }, + "BL.HALLAM": { + "openaire_id": "re3data_____::r3d100011909", + "datacite_name": "Sheffield Hallam University", + "official_name": "Sheffield Hallam University Research Data Archive" + }, + "BL.LSHTM": { + "openaire_id": "re3data_____::r3d100011800", + "datacite_name": "London School of Hygiene and Tropical Medicine", + "official_name": "LSHTM Data Compass" + }, + "SUBGOE.DARIAH": { + "openaire_id": "re3data_____::r3d100011345", + "datacite_name": "Digital Research Infrastructure for the Arts and Humanities", + "official_name": "DARIAH-DE Repository" + }, + "SND.SU": { + "openaire_id": "re3data_____::r3d100012147", + "datacite_name": "Stockholm University", + "official_name": "Stockholm University repository for data" + }, + "GESIS.INDEPTH": { + "openaire_id": "re3data_____::r3d100011392", + "datacite_name": "INDEPTH Network", + "official_name": "INDEPTH Data Repository" + }, + "TIB.FLOSS": { + "openaire_id": "re3data_____::r3d100010863", + "datacite_name": "FLOSS Project, Syracuse University", + "official_name": "FLOSSmole" + }, + "ETHZ.WGMS": { + "openaire_id": "re3data_____::r3d100010627", + "datacite_name": "World Glacier Monitoring Service", + "official_name": "World Glacier Monitoring Service" + }, + "BL.UEL": { + "openaire_id": "re3data_____::r3d100012414", + "datacite_name": "University of East London", + "official_name": "Data.uel" + }, + "DELFT.DATA4TU": { + "openaire_id": "re3data_____::r3d100010216", + "datacite_name": "4TU.Centre for Research Data", + "official_name": "4TU.Centre for Research Data" + }, + "GESIS.IANUS": { + "openaire_id": "re3data_____::r3d100012361", + "datacite_name": "IANUS - FDZ Arch\u00e4ologie & Altertumswissenschaften", + "official_name": "IANUS Datenportal" + }, + "CDL.UCSDCCA": { + "openaire_id": "re3data_____::r3d100011655", + "datacite_name": "California Coastal Atlas", + "official_name": "California Coastal Atlas" + }, + "VIVA.VT": { + "openaire_id": "re3data_____::r3d100012601", + "datacite_name": "Virginia Tech", + "official_name": "VTechData" + }, + "ANDS.CENTRE39": { + "openaire_id": "re3data_____::r3d100011640", + "datacite_name": "University of the Sunshine Coast", + "official_name": "USC Research Bank research data" + }, + "DEMO.OPENKIM": { + "openaire_id": "re3data_____::r3d100011864", + "datacite_name": "OpenKIM", + "official_name": "OpenKIM" + }, + "INIST.OTELO": { + "openaire_id": "re3data_____::r3d100012505", + "datacite_name": "Observatoire Terre Environnement 
de Lorraine", + "official_name": "ORDaR" + }, + "INIST.ILL": { + "openaire_id": "re3data_____::r3d100012072", + "datacite_name": "Institut Laue-Langevin", + "official_name": "ILL Data Portal" + }, + "ANDS.CENTRE31": { + "openaire_id": "re3data_____::r3d100012378", + "datacite_name": "Test: Analysis and Policy Observatory", + "official_name": "Analysis and Policy Observatory", + "similarity": 0.9117647058823529 + }, + "ANDS.CENTRE30": { + "openaire_id": "re3data_____::r3d100010917", + "datacite_name": "Test: Geoscience Australia", + "official_name": "Geoscience Australia", + "similarity": 0.8695652173913043 + }, + "BL.SALFORD": { + "openaire_id": "re3data_____::r3d100012144", + "datacite_name": "University of Salford", + "official_name": "University of Salford Data Repository" + }, + "CERN.HEPDATA": { + "openaire_id": "re3data_____::r3d100010081", + "datacite_name": "HEPData.net", + "official_name": "HEPData" + }, + "ETHZ.E-COLL": { + "openaire_id": "re3data_____::r3d100012557", + "datacite_name": "ETH Z\u00fcrich Research Collection", + "official_name": "ETH Z\u00fcrich Research Collection" + }, + "GBIF.GBIF": { + "openaire_id": "re3data_____::r3d100000039", + "datacite_name": "Global Biodiversity Information Facility", + "official_name": "Global Biodiversity Information Facility" + }, + "ORNLDAAC.DAAC": { + "openaire_id": "re3data_____::r3d100000037", + "datacite_name": "Oak Ridge National Laboratory Distributed Active Archive Center", + "official_name": "Oak Ridge National Laboratory Distributed Active Archive Center for Biogeochemical Dynamics" + }, + "KAUST.KAUSTREPO": { + "openaire_id": "re3data_____::r3d100011898", + "datacite_name": "KAUST Research Repository", + "official_name": "UWA Research Repository", + "similarity": 0.875 + }, + "ZBW.ZEW": { + "openaire_id": "re3data_____::r3d100010399", + "datacite_name": "Zentrum f\u00fcr Europ\u00e4ische Wirtschaftsforschung GmbH (ZEW)", + "official_name": "ZEW Forschungsdatenzentrum" + }, + "SML.TDAR": { + "openaire_id": "re3data_____::r3d100010347", + "datacite_name": "Digital Antiquity (TDAR)", + "official_name": "tDAR" + }, + "GESIS.CSDA": { + "openaire_id": "re3data_____::r3d100010484", + "datacite_name": "Czech Social Science Data Archive", + "official_name": "Czech Social Science Data Archive" + }, + "SND.BOLIN": { + "openaire_id": "re3data_____::r3d100011699", + "datacite_name": "Bolin Centre Database", + "official_name": "Bolin Centre Database" + }, + "MLA.HC": { + "openaire_id": "re3data_____::r3d100012309", + "datacite_name": "Humanities Commons", + "official_name": "Humanities Commons" + }, + "CDL.IDASHREP": { + "openaire_id": "re3data_____::r3d100010382", + "datacite_name": "iDASH Repository", + "official_name": "IDS Repository", + "similarity": 0.8666666666666667 + }, + "ZBMED.SNSB": { + "openaire_id": "re3data_____::r3d100011873", + "datacite_name": "Staatliche Naturwissenschaftliche Sammlungen Bayerns", + "official_name": "Staatliche Naturwissenschaftliche Sammlungen Bayerns - datasets", + "similarity": 0.9043478260869565 + }, + "ORBIS.OHSU": { + "openaire_id": "re3data_____::r3d100012244", + "datacite_name": "Oregon Health Sciences University", + "official_name": "OHSU Digital Commons" + }, + "DARTLIB.CRAWDAD": { + "openaire_id": "re3data_____::r3d100010716", + "datacite_name": "CRAWDAD", + "official_name": "CRAWDAD" + }, + "CDL.CCHDO": { + "openaire_id": "re3data_____::r3d100010831", + "datacite_name": "CLIVAR and Carbon Hydrographic Data Office", + "official_name": "Climate Variability and Predictability and Carbon 
Hydrographic Data Office" + }, + "GESIS.AUSSDA": { + "openaire_id": "re3data_____::r3d100010483", + "datacite_name": "Austrian Social Science Data Archive", + "official_name": "AUSSDA" + }, + "NSIDC.DATACTR": { + "openaire_id": "re3data_____::r3d100010110", + "datacite_name": "National Snow and Ice Data Center", + "official_name": "National Snow and Ice Data Center" + }, + "TIB.RADAR": { + "openaire_id": "re3data_____::r3d100012330", + "datacite_name": "FIZ Karlsruhe \u2013 Leibniz-Institut f\u00fcr Informationsinfrastruktur", + "official_name": "RADAR" + }, + "KIM.OPENKIM": { + "openaire_id": "re3data_____::r3d100011864", + "datacite_name": "Open Knowledgebase of Interatomic Models (OpenKIM)", + "official_name": "OpenKIM" + }, + "BL.LBORO": { + "openaire_id": "re3data_____::r3d100012143", + "datacite_name": "Loughborough University", + "official_name": "Loughborough Data Repository" + }, + "GESIS.ZPID": { + "openaire_id": "re3data_____::r3d100010328", + "datacite_name": "GESIS.ZPID", + "official_name": "PsychData" + }, + "SML.TCIA": { + "openaire_id": "re3data_____::r3d100011559", + "datacite_name": "The Cancer Imaging Archive", + "official_name": "The Cancer Imaging Archive" + }, + "CDL.IRIS": { + "openaire_id": "re3data_____::r3d100010268", + "datacite_name": "Incorporated Research Institutions for Seismology", + "official_name": "Incorporated Research Institutions for Seismology" + }, + "BIBSYS.NMDC": { + "openaire_id": "re3data_____::r3d100012291", + "datacite_name": "Norwegian Marine Data Centre", + "official_name": "Norwegian Polar Data Centre", + "similarity": 0.8727272727272727 + }, + "ANDS.CENTRE25": { + "openaire_id": "re3data_____::r3d100010917", + "datacite_name": "Geoscience Australia", + "official_name": "Geoscience Australia" + }, + "BL.UCLAN": { + "openaire_id": "re3data_____::r3d100012019", + "datacite_name": "University of Central Lancashire", + "official_name": "UCLanData" + }, + "ANDS.CENTRE23": { + "openaire_id": "re3data_____::r3d100011898", + "datacite_name": "The University of Western Australia", + "official_name": "UWA Research Repository" + }, + "CISTI.WOUDC": { + "openaire_id": "re3data_____::r3d100010367", + "datacite_name": "World Ozone and Ultraviolet Radiation Data Centre", + "official_name": "World Ozone and Ultraviolet Radiation Data Centre" + }, + "FIGSHARE.ARS": { + "openaire_id": "re3data_____::r3d10001066", + "datacite_name": "figshare Academic Research System", + "official_name": "figshare" + }, + "ILLINOIS.DATABANK": { + "openaire_id": "re3data_____::r3d100012001", + "datacite_name": "Illinois Data Bank", + "official_name": "Illinois Data Bank" + }, + "BL.ECMWF": { + "openaire_id": "re3data_____::r3d100011726", + "datacite_name": "European Centre for Medium-Range Weather Forecasts", + "official_name": "European Centre for Medium-Range Weather Forecasts" + }, + "CDL.ISSDA": { + "openaire_id": "re3data_____::r3d100010497", + "datacite_name": "Irish Social Science Data Archive (ISSDA)", + "official_name": "Irish Social Science Data Archive" + }, + "CDL.PQR": { + "openaire_id": "re3data_____::r3d100012225", + "datacite_name": "Pitt Quantum Repository", + "official_name": "Pitt Quantum Repository" + }, + "ANDS.CENTRE82": { + "openaire_id": "re3data_____::r3d100010138", + "datacite_name": "Test: Australian Data Archive", + "official_name": "Australian Data Archive", + "similarity": 0.8846153846153846 + }, + "GDCC.HARVARD-SLP": { + "openaire_id": "re3data_____::r3d100011861", + "datacite_name": "National Sleep Research Resource", + "official_name": 
"National Sleep Research Resource" + }, + "CDL.IMMPORT": { + "openaire_id": "re3data_____::r3d100012529", + "datacite_name": "UCSF ImmPort", + "official_name": "ImmPort" + }, + "GESIS.FID": { + "openaire_id": "re3data_____::r3d100012347", + "datacite_name": "FID f\u00fcr internationale und interdisziplin\u00e4re Rechtsforschung", + "official_name": "\u00b2Dok[\u00a7]" + }, + "OCEAN.OCEAN": { + "openaire_id": "re3data_____::r3d100012369", + "datacite_name": "Code Ocean", + "official_name": "Code Ocean" + }, + "CERN.ZENODO": { + "openaire_id": "re3data_____::r3d100010468", + "datacite_name": "Zenodo", + "official_name": "Zenodo" + }, + "ETHZ.DA-RD": { + "openaire_id": "re3data_____::r3d100011626", + "datacite_name": "ETHZ Data Archive - Research Data", + "official_name": "ETH Data Archive" + }, + "SND.ECDS": { + "openaire_id": "re3data_____::r3d100011000", + "datacite_name": "Environment Climate Data Sweden", + "official_name": "Environment Climate Data Sweden" + }, + "BL.BATH": { + "openaire_id": "re3data_____::r3d100011947", + "datacite_name": "University of Bath", + "official_name": "University of Bath Research Data Archive" + }, + "TIB.LDEO": { + "openaire_id": "re3data_____::r3d100012547", + "datacite_name": "LDEO - Lamont-Doherty Earth Observatory, Columbia University", + "official_name": "Lamont-Doherty Core Repository" + }, + "COS.OSF": { + "openaire_id": "re3data_____::r3d100011137", + "datacite_name": "Open Science Framework", + "official_name": "Open Science Framework" + }, + "ESTDOI.REPO": { + "openaire_id": "re3data_____::r3d100012333", + "datacite_name": "DataDOI", + "official_name": "DataDOI" + }, + "CDL.NSFADC": { + "openaire_id": "re3data_____::r3d100011973", + "datacite_name": "NSF Arctic Data Center", + "official_name": "NSF Arctic Data Center" + }, + "ANDS.CENTRE13": { + "openaire_id": "re3data_____::r3d100010477", + "datacite_name": "The Australian National University", + "official_name": "ANU Data Commons" + }, + "BL.NERC": { + "openaire_id": "re3data_____::r3d100010199", + "datacite_name": "Natural Environment Research Council", + "official_name": "Environmental Information Data Centre" + }, + "SAGEBIO.SYNAPSE": { + "openaire_id": "re3data_____::r3d100011894", + "datacite_name": "Synapse", + "official_name": "Synapse" + }, + "ANDS.CENTRE15": { + "openaire_id": "re3data_____::r3d100000038", + "datacite_name": "Australian Antarctic Division", + "official_name": "Australian Antarctic Data Centre" + }, + "WISC.BMRB": { + "openaire_id": "re3data_____::r3d100010191", + "datacite_name": "Biological Magnetic Resonance Bank", + "official_name": "Biological Magnetic Resonance Data Bank", + "similarity": 0.9315068493150684 + }, + "STSCI.MAST": { + "openaire_id": "re3data_____::r3d100010403", + "datacite_name": "Barbara A. Mikulski Archive for Space Telescopes", + "official_name": "Barbara A. 
Mikulski Archive for Space Telescopes" + }, + "CDL.NSIDC": { + "openaire_id": "re3data_____::r3d100010110", + "datacite_name": "National Snow and Ice Data Center", + "official_name": "National Snow and Ice Data Center" + }, + "BL.STRATH": { + "openaire_id": "re3data_____::r3d100012412", + "datacite_name": "University of Strathclyde", + "official_name": "University of Strathclyde KnowledgeBase Datasets" + }, + "DEMO.TDAR": { + "openaire_id": "re3data_____::r3d100010347", + "datacite_name": "The Digital Archaeological Record (tDAR)", + "official_name": "tDAR" + }, + "TIND.CALTECH": { + "openaire_id": "re3data_____::r3d100012384", + "datacite_name": "CaltechDATA", + "official_name": "CaltechDATA" + }, + "GESIS.BIBB-FDZ": { + "openaire_id": "re3data_____::r3d100010190", + "datacite_name": "Forschungsdatenzentrum im Bundesinstitut f\u00fcr Berufsbildung", + "official_name": "Forschungsdatenzentrum im Bundesinstitut f\u00fcr Berufsbildung" + }, + "ANDS.CENTRE87": { + "openaire_id": "re3data_____::r3d100010138", + "datacite_name": "Australian Data Archive", + "official_name": "Australian Data Archive" + }, + "GESIS.NEPS": { + "openaire_id": "re3data_____::r3d100010736", + "datacite_name": "Nationales Bildungspanel (National Educational Panel Study, NEPS)", + "official_name": "Nationales Bildungspanel" + }, + "CDL.UCBCRCNS": { + "openaire_id": "re3data_____::r3d100011269", + "datacite_name": "Collaborative Research in Computational Neuroscience (CRCNS)", + "official_name": "Collaborative Research in Computational Neuroscience" + }, + "TIB.UKON": { + "openaire_id": "re3data_____::r3d100010469", + "datacite_name": "Movebank", + "official_name": "Movebank" + }, + "UMN.IPUMS": { + "openaire_id": "re3data_____::r3d100010794", + "datacite_name": "Minnesota Population Center", + "official_name": "Minnesota Population Center" + }, + "TIB.BIKF": { + "openaire_id": "re3data_____::r3d100012379", + "datacite_name": "Senckenberg Data & Metadata Repository", + "official_name": "Senckenberg Data & Metadata Repository" + }, + "TDL.GRIIDC": { + "openaire_id": "re3data_____::r3d100011571", + "datacite_name": "Gulf of Mexico Research Initiative Information and Data Cooperative", + "official_name": "Gulf of Mexico Research Initiative Information and Data Cooperative" + }, + "DELFT.NIBG": { + "openaire_id": "re3data_____::r3d100012167", + "datacite_name": "Sound and Vision", + "official_name": "Sound and Vision" + }, + "BL.SURREY": { + "openaire_id": "re3data_____::r3d100012232", + "datacite_name": "University of Surrey", + "official_name": "Surrey Research Insight" + }, + "OSTI.ORNLNGEE": { + "openaire_id": "re3data_____::r3d100011676", + "datacite_name": "NGEE-Arctic (Next Generation Ecosystems Experiement)", + "official_name": "NGEE Arctic" + }, + "TIB.WDCRSAT": { + "openaire_id": "re3data_____::r3d100010156", + "datacite_name": "World Data Center for Remote Sensing of the Atmosphere", + "official_name": "The World Data Center for Remote Sensing of the Atmosphere", + "similarity": 0.9642857142857143 + }, + "ZBMED.DSMZ": { + "openaire_id": "re3data_____::r3d100010219", + "datacite_name": "DSMZ", + "official_name": "DSMZ" + }, + "DOINZ.NZAU": { + "openaire_id": "re3data_____::r3d100012110", + "datacite_name": "University of Auckland Data Publishing and Discovery Service", + "official_name": "University of Auckland Data Repository" + }, + "INIST.RESIF": { + "openaire_id": "re3data_____::r3d100012222", + "datacite_name": "R\u00e9seau sismologique et g\u00e9od\u00e9sique fran\u00e7ais", + "official_name": "RESIF Seismic 
Data Portal" + }, + "CDL.NCEAS": { + "openaire_id": "re3data_____::r3d100010093", + "datacite_name": "National Center for Ecological Analysis and Synthesis (NCEAS)", + "official_name": "National Center for Ecological Analysis and Synthesis Data Repository" + }, + "ZBMED.EMP": { + "openaire_id": "re3data_____::r3d100010234", + "datacite_name": "eyeMoviePedia", + "official_name": "eyeMoviePedia" + }, + "ZBMED.BIOFRESH": { + "openaire_id": "re3data_____::r3d100011651", + "datacite_name": "Project BioFresh, Leibniz-Institute of Freshwater Ecology and Inland Fisheries", + "official_name": "Freshwater Biodiversity Data Portal" + }, + "INIST.IFREMER": { + "openaire_id": "re3data_____::r3d100011867", + "datacite_name": "Institut Fran\u00e7ais de Recherche pour l'Exploitation de la Mer", + "official_name": "SEANOE" + }, + "ETHZ.SICAS": { + "openaire_id": "re3data_____::r3d100011560", + "datacite_name": "SICAS", + "official_name": "Sicas Medical Image Repository" + }, + "SND.SND": { + "openaire_id": "re3data_____::r3d100010146", + "datacite_name": "Swedish National Data Service", + "official_name": "Swedish National Data Service" + }, + "DELFT.EASY": { + "openaire_id": "re3data_____::r3d100011201", + "datacite_name": "DANS", + "official_name": "DataverseNL" + }, + "WH.WHOAS": { + "openaire_id": "re3data_____::r3d100010423", + "datacite_name": "Woods Hole Open Access Server", + "official_name": "Woods Hole Open Access Server" + }, + "DATACITE.UCSC": { + "openaire_id": "re3data_____::r3d100010243", + "datacite_name": "UCSC Genome Browser", + "official_name": "UCSC Genome Browser" + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json new file mode 100644 index 000000000..967e4445a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json @@ -0,0 +1,27 @@ +[ + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "the path of the sequencial file to write", + "paramRequired": true + }, + + { + "paramName": "d", + "paramLongName": "dataciteDumpPath", + "paramDescription": "the path of the Datacite dump", + "paramRequired": true + }, + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "m", + "paramLongName": "master", + "paramDescription": "the master name", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml new file mode 100644 index 000000000..2e0ed9aee --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml new file mode 100644 index 000000000..a3caa5e23 
--- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml @@ -0,0 +1,103 @@ + + + + mdstoreInputPath + the path of the input MDStore + + + + mdstoreOutputPath + the path of the cleaned mdstore + + + nativeInputPath + the path of the native MDStore + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn-cluster + cluster + ImportDatacite + eu.dnetlib.dhp.actionmanager.datacite.ImportDatacite + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + -t${nativeInputPath} + -d${mdstoreInputPath} + -n${nameNode} + --masteryarn-cluster + + + + + + + + + yarn-cluster + cluster + TransformJob + eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${mdstoreInputPath} + --targetPath${mdstoreOutputPath} + --isLookupUrl${isLookupUrl} + -tr${isLookupUrl} + --masteryarn-cluster + + + + + + + + + yarn-cluster + cluster + ExportDataset + eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${mdstoreOutputPath} + --targetPath${mdstoreOutputPath}_raw_AS + --masteryarn-cluster + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 683986de2..170dc0dc8 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -111,12 +111,12 @@ object DoiBoostMappingUtil { result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype)) } result.getInstance().asScala.foreach(i => { - i.setHostedby(getUbknownHostedBy()) + i.setHostedby(getUnknownHostedBy()) }) result } - def getUbknownHostedBy():KeyValue = { + def getUnknownHostedBy():KeyValue = { val hb = new KeyValue hb.setValue("Unknown Repository") hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c") diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala
b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala index 8043236e0..3ec391313 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/DLIToOAF.scala @@ -224,7 +224,7 @@ object DLIToOAF { if (cleanedPids.isEmpty) return null result.setId(generateId(inputPublication.getId)) - result.setDataInfo(generateDataInfo(invisibile = true)) + result.setDataInfo(generateDataInfo(invisible = true)) if (inputPublication.getCollectedfrom == null || inputPublication.getCollectedfrom.size() == 0 || (inputPublication.getCollectedfrom.size() == 1 && inputPublication.getCollectedfrom.get(0) == null)) return null result.setCollectedfrom(inputPublication.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava) From 0f8e2ecce6b8e55942bb56de4f4fdae462f25129 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 29 Jan 2021 10:45:07 +0100 Subject: [PATCH 08/86] Merged Datacite transform into this branch --- dhp-workflows/dhp-aggregation/pom.xml | 14 ++++-------- .../DataciteToOAFTransformation.scala | 14 +++++++++--- .../datacite/ImportDatacite.scala | 1 + .../datacite/oozie_app/workflow.xml | 22 ++++++++++++++++++- 4 files changed, 37 insertions(+), 14 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index 0445e0e1b..b61c3d443 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -37,7 +37,7 @@ - + @@ -58,15 +58,9 @@ eu.dnetlib.dhp dhp-common ${project.version} - - - com.sun.xml.bind - jaxb-core - - - - + + eu.dnetlib.dhp dhp-schemas ${project.version} @@ -116,4 +110,4 @@ - + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala index 9418e71da..933f1445f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala @@ -11,9 +11,10 @@ import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.jackson.JsonMethods.parse import java.nio.charset.CodingErrorAction +import java.text.SimpleDateFormat import java.time.LocalDate import java.time.format.DateTimeFormatter -import java.util.Locale +import java.util.{Date, Locale} import java.util.regex.Pattern import scala.collection.JavaConverters._ import scala.io.{Codec, Source} @@ -44,6 +45,8 @@ object DataciteToOAFTransformation { codec.onMalformedInput(CodingErrorAction.REPLACE) codec.onUnmappableCharacter(CodingErrorAction.REPLACE) + + private val PID_VOCABULARY = "dnet:pid_types" val COBJ_VOCABULARY = "dnet:publication_resource" val RESULT_VOCABULARY = "dnet:result_typologies" @@ -298,8 +301,13 @@ object DataciteToOAFTransformation { result.setPid(List(pid).asJava) result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true)) result.setOriginalId(List(doi).asJava) - result.setDateofcollection(s"${dateOfCollection}") - result.setDateoftransformation(s"$ts") + + val d = new Date(dateOfCollection*1000) + val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US) + + +
result.setDateofcollection(ISO8601FORMAT.format(d)) + result.setDateoftransformation(ISO8601FORMAT.format(ts)) result.setDataInfo(dataInfo) val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List()) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala index 06fcbb518..d5edb674a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala @@ -108,6 +108,7 @@ object ImportDatacite { val cnt = writeSequenceFile(hdfsTargetPath, ts, conf) + log.info(s"Imported from Datacite API $cnt documents") if (cnt > 0) { diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml index a3caa5e23..047794c9c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml @@ -13,13 +13,25 @@ nativeInputPath the path of the input MDStore + + - + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + ${wf:conf('resumeFrom') eq 'TransformJob'} + ${wf:conf('resumeFrom') eq 'ExportDataset'} + + + + yarn-cluster @@ -69,6 +81,14 @@ -tr${isLookupUrl} --masteryarn-cluster + + + + + + + + From d942d0c77d8ee9fbd9028ee31f1284117be295f1 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 29 Jan 2021 13:16:48 +0100 Subject: [PATCH 09/86] methods toString(), hashCode() and equals() --- .../mdstore/manager/common/model/MDStore.java | 20 ++++++++++++++++++ .../common/model/MDStoreCurrentVersion.java | 19 +++++++++++++++++ .../manager/common/model/MDStoreVersion.java | 20 ++++++++++++++++++ .../manager/common/model/MDStoreWithInfo.java | 21 +++++++++++++++++++ 4 files changed, 80 insertions(+) diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java index 345500737..db200cd6a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java @@ -3,6 +3,7 @@ package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.Date; +import java.util.Objects; import java.util.UUID; import javax.persistence.Column; @@ -153,4 +154,23 @@ public class MDStore implements Serializable { return md; } + @Override + public String toString() { + return String + .format("MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]", id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate); + } + + @Override + public int hashCode() { + return Objects.hash(id); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { return true; } + if (!(obj instanceof MDStore)) { return false; } + final MDStore other = (MDStore) obj; + return Objects.equals(id, other.id); + } + } diff --git 
a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java index f74ab39be..e25e7dc2a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java @@ -2,6 +2,7 @@ package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; +import java.util.Objects; import javax.persistence.Column; import javax.persistence.Entity; @@ -48,4 +49,22 @@ public class MDStoreCurrentVersion implements Serializable { public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) { return newInstance(v.getMdstore(), v.getId()); } + + @Override + public String toString() { + return String.format("MDStoreCurrentVersion [mdstore=%s, currentVersion=%s]", mdstore, currentVersion); + } + + @Override + public int hashCode() { + return Objects.hash(currentVersion, mdstore); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { return true; } + if (!(obj instanceof MDStoreCurrentVersion)) { return false; } + final MDStoreCurrentVersion other = (MDStoreCurrentVersion) obj; + return Objects.equals(currentVersion, other.currentVersion) && Objects.equals(mdstore, other.mdstore); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java index 62370c0f5..26c34fcad 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java @@ -3,6 +3,7 @@ package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.Date; +import java.util.Objects; import javax.persistence.Column; import javax.persistence.Entity; @@ -111,4 +112,23 @@ public class MDStoreVersion implements Serializable { public void setHdfsPath(final String hdfsPath) { this.hdfsPath = hdfsPath; } + + @Override + public String toString() { + return String + .format("MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id, mdstore, writing, readCount, lastUpdate, size, hdfsPath); + } + + @Override + public int hashCode() { + return Objects.hash(id); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { return true; } + if (!(obj instanceof MDStoreVersion)) { return false; } + final MDStoreVersion other = (MDStoreVersion) obj; + return Objects.equals(id, other.id); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java index 72915a9c8..e34e4c000 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java @@ -3,6 +3,7 @@ package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.Date; +import java.util.Objects; import javax.persistence.Column; import javax.persistence.Entity; @@ -163,4 +164,24 @@ public class MDStoreWithInfo implements Serializable { public void setHdfsPath(final String hdfsPath) { this.hdfsPath = hdfsPath; } + 
+ @Override + public String toString() { + return String + .format("MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]", id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate, lastUpdate, size, numberOfVersions, hdfsPath); + } + + @Override + public int hashCode() { + return Objects.hash(id); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { return true; } + if (!(obj instanceof MDStoreWithInfo)) { return false; } + final MDStoreWithInfo other = (MDStoreWithInfo) obj; + return Objects.equals(id, other.id); + } + } From 027618003951bcdc51cbd60b7d09163696795386 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 29 Jan 2021 16:42:41 +0100 Subject: [PATCH 10/86] WIP mdstore transaction implemented on hadoop side --- .../mdstore/manager/common/model/MDStore.java | 12 +- .../common/model/MDStoreCurrentVersion.java | 8 +- .../manager/common/model/MDStoreVersion.java | 12 +- .../manager/common/model/MDStoreWithInfo.java | 13 +- .../collector/worker/model/ApiDescriptor.java | 2 +- .../dhp/common/rest/DNetRestClient.java | 54 ++++++ .../mdstore/MDStoreActionNode.java | 164 ++++++++++++++++++ .../GenerateNativeStoreSparkJob.java | 46 ++++- .../collection/plugin/CollectorPlugin.java | 2 +- .../plugin/oai/OaiCollectorPlugin.java | 2 +- .../collection/worker/CollectorWorker.java | 5 +- .../worker/CollectorWorkerApplication.java | 28 ++- .../datacite/oozie_app/config-default.xml | 5 + .../collection_input_parameters.json | 12 +- .../dhp/collection/collector_parameter.json | 28 ++- .../collection/mdstore_action_parameters.json | 45 +++++ .../dhp/collection/oozie_app/workflow.xml | 114 ++++++++++-- .../DnetCollectorWorkerApplicationTests.java | 9 +- 18 files changed, 495 insertions(+), 66 deletions(-) rename dhp-common/src/main/java/eu/dnetlib/{ => dhp}/collector/worker/model/ApiDescriptor.java (93%) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java index db200cd6a..59fe941ed 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java @@ -157,7 +157,9 @@ public class MDStore implements Serializable { @Override public String toString() { return String - .format("MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]", id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate); + .format( + "MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]", + id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate); } @Override @@ -167,8 +169,12 @@ public class MDStore implements Serializable { @Override public boolean equals(final Object obj) { - if (this == obj) { 
return true; } - if (!(obj instanceof MDStore)) { return false; } + if (this == obj) { + return true; + } + if (!(obj instanceof MDStore)) { + return false; + } final MDStore other = (MDStore) obj; return Objects.equals(id, other.id); } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java index e25e7dc2a..d808e2de7 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java @@ -62,8 +62,12 @@ public class MDStoreCurrentVersion implements Serializable { @Override public boolean equals(final Object obj) { - if (this == obj) { return true; } - if (!(obj instanceof MDStoreCurrentVersion)) { return false; } + if (this == obj) { + return true; + } + if (!(obj instanceof MDStoreCurrentVersion)) { + return false; + } final MDStoreCurrentVersion other = (MDStoreCurrentVersion) obj; return Objects.equals(currentVersion, other.currentVersion) && Objects.equals(mdstore, other.mdstore); } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java index 26c34fcad..38f8f275e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java @@ -116,7 +116,9 @@ public class MDStoreVersion implements Serializable { @Override public String toString() { return String - .format("MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id, mdstore, writing, readCount, lastUpdate, size, hdfsPath); + .format( + "MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id, + mdstore, writing, readCount, lastUpdate, size, hdfsPath); } @Override @@ -126,8 +128,12 @@ public class MDStoreVersion implements Serializable { @Override public boolean equals(final Object obj) { - if (this == obj) { return true; } - if (!(obj instanceof MDStoreVersion)) { return false; } + if (this == obj) { + return true; + } + if (!(obj instanceof MDStoreVersion)) { + return false; + } final MDStoreVersion other = (MDStoreVersion) obj; return Objects.equals(id, other.id); } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java index e34e4c000..510c65092 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java @@ -168,7 +168,10 @@ public class MDStoreWithInfo implements Serializable { @Override public String toString() { return String - .format("MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]", id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate, lastUpdate, size, numberOfVersions, hdfsPath); + .format( + "MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, 
apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]", + id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate, + lastUpdate, size, numberOfVersions, hdfsPath); } @Override @@ -178,8 +181,12 @@ public class MDStoreWithInfo implements Serializable { @Override public boolean equals(final Object obj) { - if (this == obj) { return true; } - if (!(obj instanceof MDStoreWithInfo)) { return false; } + if (this == obj) { + return true; + } + if (!(obj instanceof MDStoreWithInfo)) { + return false; + } final MDStoreWithInfo other = (MDStoreWithInfo) obj; return Objects.equals(id, other.id); } diff --git a/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java b/dhp-common/src/main/java/eu/dnetlib/dhp/collector/worker/model/ApiDescriptor.java similarity index 93% rename from dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/collector/worker/model/ApiDescriptor.java index bfd70e8c6..8ba30faeb 100644 --- a/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/collector/worker/model/ApiDescriptor.java @@ -1,5 +1,5 @@ -package eu.dnetlib.collector.worker.model; +package eu.dnetlib.dhp.collector.worker.model; import java.util.HashMap; import java.util.Map; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java new file mode 100644 index 000000000..014f18606 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java @@ -0,0 +1,54 @@ + +package eu.dnetlib.dhp.common.rest; + +import org.apache.commons.io.IOUtils; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.client.methods.HttpUriRequest; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; + +import com.fasterxml.jackson.databind.ObjectMapper; + +public class DNetRestClient { + + private static ObjectMapper mapper = new ObjectMapper(); + + public static T doGET(final String url, Class clazz) throws Exception { + final HttpGet httpGet = new HttpGet(url); + return doHTTPRequest(httpGet, clazz); + } + + public static String doGET(final String url) throws Exception { + final HttpGet httpGet = new HttpGet(url); + return doHTTPRequest(httpGet); + } + + public static String doPOST(final String url, V objParam) throws Exception { + final HttpPost httpPost = new HttpPost(url); + + if (objParam != null) { + final StringEntity entity = new StringEntity(mapper.writeValueAsString(objParam)); + httpPost.setEntity(entity); + httpPost.setHeader("Accept", "application/json"); + httpPost.setHeader("Content-type", "application/json"); + } + return doHTTPRequest(httpPost); + } + + public static T doPOST(final String url, V objParam, Class clazz) throws Exception { + return mapper.readValue(doPOST(url, objParam), clazz); + } + + private static String doHTTPRequest(final HttpUriRequest r) throws Exception { + CloseableHttpClient client = HttpClients.createDefault(); + CloseableHttpResponse response = client.execute(r); + return IOUtils.toString(response.getEntity().getContent()); + } + + private static T doHTTPRequest(final HttpUriRequest r, Class 
clazz) throws Exception { + return mapper.readValue(doHTTPRequest(r), clazz); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java new file mode 100644 index 000000000..d4824ed0a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java @@ -0,0 +1,164 @@ + +package eu.dnetlib.dhp.aggregation.mdstore; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.OutputStream; +import java.net.URI; +import java.util.Properties; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.worker.CollectorWorker; +import eu.dnetlib.dhp.common.rest.DNetRestClient; + +public class MDStoreActionNode { + private static final Logger log = LoggerFactory.getLogger(MDStoreActionNode.class); + + enum MDAction { + NEW_VERSION, ROLLBACK, COMMIT, READ_LOCK, READ_UNLOCK + + } + + private static final ObjectMapper mapper = new ObjectMapper(); + + public static String NEW_VERSION_URI = "%s/mdstore/%s/newVersion"; + + public static final String COMMIT_VERSION_URL = "%s/version/%s/commit/%s"; + public static final String ROLLBACK_VERSION_URL = "%s/version/%s/abort"; + + public static final String READ_LOCK_URL = "%s/mdstores/mdstore/%s/startReading"; + public static final String READ_UNLOCK_URL = "%s/mdstores/version/%s/endReading"; + + private static final String MDSTOREVERSIONPARAM = "mdStoreVersion"; + private static final String MDSTOREREADLOCKPARAM = "mdStoreReadLockVersion"; + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser( + IOUtils + .toString( + CollectorWorker.class + .getResourceAsStream( + "/eu/dnetlib/dhp/collection/mdstore_action_parameters.json"))); + argumentParser.parseArgument(args); + + final MDAction action = MDAction.valueOf(argumentParser.get("action")); + log.info("Current action is {}", action); + + final String mdStoreManagerURI = argumentParser.get("mdStoreManagerURI"); + log.info("mdStoreManagerURI is {}", mdStoreManagerURI); + + switch (action) { + case NEW_VERSION: { + final String mdStoreID = argumentParser.get("mdStoreID"); + if (StringUtils.isBlank(mdStoreID)) { + throw new IllegalArgumentException("missing or empty argument mdStoreId"); + } + final MDStoreVersion currentVersion = DNetRestClient + .doGET(String.format(NEW_VERSION_URI, mdStoreManagerURI, mdStoreID), MDStoreVersion.class); + populateOOZIEEnv(MDSTOREVERSIONPARAM, mapper.writeValueAsString(currentVersion)); + break; + } + case COMMIT: { + + final String hdfsuri = argumentParser.get("namenode"); + if (StringUtils.isBlank(hdfsuri)) { + throw new IllegalArgumentException("missing or empty argument namenode"); + } + final String mdStoreVersion_params = argumentParser.get("mdStoreVersion"); + final MDStoreVersion mdStoreVersion = mapper.readValue(mdStoreVersion_params, MDStoreVersion.class); + + if
(StringUtils.isBlank(mdStoreVersion.getId())) { + throw new IllegalArgumentException( + "invalid MDStoreVersion value current is " + mdStoreVersion_params); + } + + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + + System.setProperty("hadoop.home.dir", "/"); + // Get the filesystem - HDFS + FileSystem fs = FileSystem.get(URI.create(hdfsuri), conf); + String mdStoreSizeParam = argumentParser.get("mdStoreSize"); + + if (StringUtils.isBlank(mdStoreSizeParam)) { + throw new IllegalArgumentException("missing or empty argument mdStoreSize"); + } + Path hdfstoreSizepath = new Path(mdStoreVersion.getHdfsPath() + "/size"); + + FSDataInputStream inputStream = fs.open(hdfstoreSizepath); + + final Long mdStoreSize = Long.parseLong(IOUtils.toString(inputStream)); + + inputStream.close(); + fs.create(hdfstoreSizepath); + + DNetRestClient + .doGET(String.format(COMMIT_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId(), mdStoreSize)); + break; + } + case ROLLBACK: { + final String mdStoreVersion_params = argumentParser.get("mdStoreVersion"); + final MDStoreVersion mdStoreVersion = mapper.readValue(mdStoreVersion_params, MDStoreVersion.class); + + if (StringUtils.isBlank(mdStoreVersion.getId())) { + throw new IllegalArgumentException( + "invalid MDStoreVersion value current is " + mdStoreVersion_params); + } + DNetRestClient.doGET(String.format(ROLLBACK_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId())); + break; + } + + case READ_LOCK: { + final String mdStoreID = argumentParser.get("mdStoreID"); + if (StringUtils.isBlank(mdStoreID)) { + throw new IllegalArgumentException("missing or empty argument mdStoreId"); + } + final MDStoreVersion currentVersion = DNetRestClient + .doGET(String.format(READ_LOCK_URL, mdStoreManagerURI, mdStoreID), MDStoreVersion.class); + populateOOZIEEnv(MDSTOREREADLOCKPARAM, mapper.writeValueAsString(currentVersion)); + break; + } + case READ_UNLOCK: { + final String mdStoreVersion_params = argumentParser.get("readMDStoreId"); + final MDStoreVersion mdStoreVersion = mapper.readValue(mdStoreVersion_params, MDStoreVersion.class); + + if (StringUtils.isBlank(mdStoreVersion.getId())) { + throw new IllegalArgumentException( + "invalid MDStoreVersion value current is " + mdStoreVersion_params); + } + DNetRestClient.doGET(String.format(READ_UNLOCK_URL, mdStoreManagerURI, mdStoreVersion.getId())); + break; + } + + default: + throw new IllegalArgumentException("invalid action"); + } + + } + + public static void populateOOZIEEnv(final String paramName, String value) throws Exception { + File file = new File(System.getProperty("oozie.action.output.properties")); + Properties props = new Properties(); + + props.setProperty(paramName, value); + OutputStream os = new FileOutputStream(file); + props.store(os, ""); + os.close(); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index c9c29b4ea..b28327a40 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -3,13 +3,17 @@ package 
eu.dnetlib.dhp.collection; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.io.ByteArrayInputStream; +import java.io.*; import java.nio.charset.StandardCharsets; import java.util.Objects; import java.util.Optional; +import java.util.Properties; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; @@ -19,6 +23,7 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; import org.dom4j.Node; @@ -28,7 +33,11 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication; +import eu.dnetlib.dhp.common.rest.DNetRestClient; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.model.mdstore.Provenance; import eu.dnetlib.message.MessageManager; @@ -36,6 +45,7 @@ import eu.dnetlib.message.MessageManager; public class GenerateNativeStoreSparkJob { private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class); + private static final String DATASET_NAME = "/store"; public static void main(String[] args) throws Exception { @@ -50,11 +60,15 @@ public class GenerateNativeStoreSparkJob { final String provenanceArgument = parser.get("provenance"); log.info("Provenance is {}", provenanceArgument); final Provenance provenance = jsonMapper.readValue(provenanceArgument, Provenance.class); + final String dateOfCollectionArgs = parser.get("dateOfCollection"); log.info("dateOfCollection is {}", dateOfCollectionArgs); final long dateOfCollection = new Long(dateOfCollectionArgs); - final String sequenceFileInputPath = parser.get("input"); - log.info("sequenceFileInputPath is {}", dateOfCollectionArgs); + + String mdStoreVersion = parser.get("mdStoreVersion"); + log.info("mdStoreVersion is {}", mdStoreVersion); + + final MDStoreVersion currentVersion = jsonMapper.readValue(mdStoreVersion, MDStoreVersion.class); Boolean isSparkSessionManaged = Optional .ofNullable(parser.get("isSparkSessionManaged")) @@ -70,7 +84,9 @@ public class GenerateNativeStoreSparkJob { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaPairRDD inputRDD = sc - .sequenceFile(sequenceFileInputPath, IntWritable.class, Text.class); + .sequenceFile( + currentVersion.getHdfsPath() + CollectorWorkerApplication.SEQUENTIAL_FILE_NAME, + IntWritable.class, Text.class); final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); @@ -89,12 +105,26 @@ public class GenerateNativeStoreSparkJob { .distinct(); final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mdstore = spark.createDataset(nativeStore.rdd(), encoder); - final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords"); - mdStoreRecords.add(mdstore.count()); + Dataset 
mdstore = spark.createDataset(nativeStore.rdd(), encoder); - mdstore.write().format("parquet").save(parser.get("output")); + mdstore + .write() + .mode(SaveMode.Overwrite) + .format("parquet") + .save(currentVersion.getHdfsPath() + DATASET_NAME); + mdstore = spark.read().load(currentVersion.getHdfsPath() + DATASET_NAME).as(encoder); + final Long total = mdstore.count(); + + FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); + + FSDataOutputStream output = fs.create(new Path(currentVersion.getHdfsPath() + "/size")); + + final BufferedOutputStream os = new BufferedOutputStream(output); + + os.write(total.toString().getBytes(StandardCharsets.UTF_8)); + + os.close(); }); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index 7146e610e..ba9bd662e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -3,8 +3,8 @@ package eu.dnetlib.dhp.collection.plugin; import java.util.stream.Stream; -import eu.dnetlib.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.collection.worker.CollectorException; +import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; public interface CollectorPlugin { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index c4c52271a..a5e261553 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -13,9 +13,9 @@ import com.google.common.base.Splitter; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; -import eu.dnetlib.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.worker.CollectorException; +import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; public class OaiCollectorPlugin implements CollectorPlugin { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java index 380db641a..3605bdfd6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java @@ -14,12 +14,9 @@ import org.apache.hadoop.io.Text; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; +import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; public class CollectorWorker { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java index 5e8d0f9c2..29ae98c5b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java @@ -1,15 +1,22 @@ package eu.dnetlib.dhp.collection.worker; +import java.io.File; +import java.io.FileOutputStream; +import java.io.OutputStream; +import java.util.Properties; + import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.collector.worker.model.ApiDescriptor; +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; +import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; +import eu.dnetlib.dhp.common.rest.DNetRestClient; /** * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module @@ -24,6 +31,8 @@ public class CollectorWorkerApplication { private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory(); + public static String SEQUENTIAL_FILE_NAME = "/sequence_file"; + /** * @param args */ @@ -38,18 +47,23 @@ public class CollectorWorkerApplication { argumentParser.parseArgument(args); final String hdfsuri = argumentParser.get("namenode"); - log.info("hdfsURI is {}", hdfsuri); - final String hdfsPath = argumentParser.get("hdfsPath"); - log.info("hdfsPath is {}" + hdfsPath); + final String apiDescriptor = argumentParser.get("apidescriptor"); - log.info("apiDescriptor is {}" + apiDescriptor); + log.info("apiDescriptor is {}", apiDescriptor); + + final String mdStoreVersion = argumentParser.get("mdStoreVersion"); + log.info("mdStoreVersion is {}", mdStoreVersion); final ObjectMapper jsonMapper = new ObjectMapper(); - final ApiDescriptor api = jsonMapper.readValue(apiDescriptor, ApiDescriptor.class); + final MDStoreVersion currentVersion = jsonMapper.readValue(mdStoreVersion, MDStoreVersion.class); - final CollectorWorker worker = new CollectorWorker(collectorPluginFactory, api, hdfsuri, hdfsPath); + final ApiDescriptor api = jsonMapper.readValue(apiDescriptor, ApiDescriptor.class); + final CollectorWorker worker = new CollectorWorker(collectorPluginFactory, api, hdfsuri, + currentVersion.getHdfsPath() + SEQUENTIAL_FILE_NAME); worker.collect(); + } + } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml index 2e0ed9aee..dd3c32c62 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml @@ -15,4 +15,9 @@ oozie.action.sharelib.for.spark spark2 + + + oozie.launcher.mapreduce.user.classpath.first + true + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json index 7f5113930..c1aa03bcd 100644 
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json @@ -30,15 +30,9 @@ "paramRequired": true }, { - "paramName": "i", - "paramLongName": "input", - "paramDescription": "the path of the sequencial file to read", - "paramRequired": true - }, - { - "paramName": "o", - "paramLongName": "output", - "paramDescription": "the path of the result DataFrame on HDFS", + "paramName": "mv", + "paramLongName": "mdStoreVersion", + "paramDescription": "the Metadata Store Version Info", "paramRequired": true }, { diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json index 901664e0d..60e9762ff 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json @@ -1,6 +1,26 @@ [ - {"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true}, - {"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true}, - {"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true}, - {"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": false} + { + "paramName": "a", + "paramLongName": "apidescriptor", + "paramDescription": "the JSON encoding of the API Descriptor", + "paramRequired": true + }, + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription": "the Name Node URI", + "paramRequired": true + }, + { + "paramName": "mv", + "paramLongName": "mdStoreVersion", + "paramDescription": "the MDStore Version bean", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workflowId", + "paramDescription": "the identifier of the dnet Workflow", + "paramRequired": false + } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json new file mode 100644 index 000000000..57a218a34 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json @@ -0,0 +1,45 @@ +[ + { + "paramName": "a", + "paramLongName": "action", + "paramDescription": "the action to be performed on the MDStore (NEW_VERSION, COMMIT, ROLLBACK, READ_LOCK, READ_UNLOCK)", + "paramRequired": true + }, + { + "paramName": "mu", + "paramLongName": "mdStoreManagerURI", + "paramDescription": "the MDStore Manager URI", + "paramRequired": true + }, + { + "paramName": "mi", + "paramLongName": "mdStoreID", + "paramDescription": "the Metadata Store ID", + "paramRequired": false + }, + { + "paramName": "ms", + "paramLongName": "mdStoreSize", + "paramDescription": "the Metadata Store Size", + "paramRequired": false + }, + { + "paramName": "mv", + "paramLongName": "mdStoreVersion", + "paramDescription": "the Metadata Version Bean", + "paramRequired": false + }, + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription": "the Name Node URI", + "paramRequired": false + }, + { + "paramName":
"rm", + "paramLongName": "readMDStoreId", + "paramDescription": "the ID Locked to Read", + "paramRequired": false + } + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index 38cd83da7..28abe0965 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -1,10 +1,5 @@ - - mdStorePath - the path of the native mdstore - - apiDescription A json encoding of the API Description class @@ -16,7 +11,7 @@ identifierPath - An xpath to retrieve the metadata idnentifier for the generation of DNet Identifier + An xpath to retrieve the metadata identifier for the generation of DNet Identifier @@ -33,26 +28,78 @@ workflowId The identifier of the workflow + + + mdStoreID + The identifier of the mdStore + + + + mdStoreManagerURI + The URI of the MDStore Manager + + + + collectionMode + Should be Refresh or Incremental + + ${jobTracker} ${nameNode} - + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + ${wf:conf('collectionMode') eq 'REFRESH'} + ${wf:conf('collectionMode') eq 'INCREMENTAL'} + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_LOCK + --mdStoreID${mdStoreID} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionNEW_VERSION + --mdStoreID${mdStoreID} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication - --hdfsPath${workingDir}/sequenceFile_${mdstoreVersion} --apidescriptor${apiDescription} --namenode${nameNode} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} - + @@ -75,13 +122,56 @@ --dateOfCollection${timestamp} --provenance${dataSourceInfo} --xpath${identifierPath} - --input${workingDir}/sequenceFile - --output${mdStorePath} - -w${workflowId} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + + + + + + + + ${wf:conf('collectionMode') eq 'REFRESH'} + ${wf:conf('collectionMode') eq 'INCREMENTAL'} + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionCOMMIT + --namenode${nameNode} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionCOMMIT + --mdStoreVersion${wf:actionData('CollectionWorker')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java index fc19f2064..9abfbacac 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java @@ -2,25 +2,18 @@ package 
eu.dnetlib.dhp.collector.worker; import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.mockito.Mockito.*; -import java.io.File; import java.nio.file.Path; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.collection.worker.CollectorWorker; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; +import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; @Disabled public class DnetCollectorWorkerApplicationTests { From 8ee82576c686fb7f7ff6c840ce91b6e6809d1dc8 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 29 Jan 2021 17:02:46 +0100 Subject: [PATCH 11/86] Collection on Refresh WORKS!!! --- .../dhp/aggregation/mdstore/MDStoreActionNode.java | 8 ++------ .../dnetlib/dhp/collection/CollectionJobTest.java | 13 +++++++++++++ .../resources/eu/dnetlib/dhp/collection/input.json | 9 +++++++++ 3 files changed, 24 insertions(+), 6 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/input.json diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java index d4824ed0a..6cb0537b2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java @@ -38,8 +38,8 @@ public class MDStoreActionNode { public static final String COMMIT_VERSION_URL = "%s/version/%s/commit/%s"; public static final String ROLLBACK_VERSION_URL = "%s/version/%s/abort"; - public static final String READ_LOCK_URL = "%s/mdstores/mdstore/%s/startReading"; - public static final String READ_UNLOCK_URL = "%s/mdstores/version/%s/endReading"; + public static final String READ_LOCK_URL = "%s/mdstore/%s/startReading"; + public static final String READ_UNLOCK_URL = "%s/version/%s/endReading"; private static final String MDSTOREVERSIONPARAM = "mdStoreVersion"; private static final String MDSTOREREADLOCKPARAM = "mdStoreReadLockVersion"; @@ -94,11 +94,7 @@ public class MDStoreActionNode { System.setProperty("hadoop.home.dir", "/"); // Get the filesystem - HDFS FileSystem fs = FileSystem.get(URI.create(hdfsuri), conf); - String mdStoreSizeParam = argumentParser.get("mdStoreSize"); - if (StringUtils.isBlank(mdStoreSizeParam)) { - throw new IllegalArgumentException("missing or empty argument mdStoreSize"); - } Path hdfstoreSizepath = new Path(mdStoreVersion.getHdfsPath() + "/size"); FSDataInputStream inputStream = fs.open(hdfstoreSizepath); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java index c3b05f5c9..6f7bb2bc2 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java @@ -16,6 +16,8 @@ import org.junit.jupiter.api.io.TempDir; 
import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreCurrentVersion; +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.model.mdstore.Provenance; import eu.dnetlib.dhp.schema.common.ModelSupport; @@ -37,6 +39,17 @@ public class CollectionJobTest { spark.stop(); } + @Test + public void testJSONSerialization() throws Exception { + final String s = IOUtils.toString(getClass().getResourceAsStream("input.json")); + System.out.println("s = " + s); + final ObjectMapper mapper = new ObjectMapper(); + MDStoreVersion mi = mapper.readValue(s, MDStoreVersion.class); + + assertNotNull(mi); + + } + @Test public void tesCollection(@TempDir Path testDir) throws Exception { final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/input.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/input.json new file mode 100644 index 000000000..4ffc33d24 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/input.json @@ -0,0 +1,9 @@ +{ + "id": "md-7557225f-77cc-407d-bdf4-d2fe03131464-1611935085410", + "mdstore": "md-7557225f-77cc-407d-bdf4-d2fe03131464", + "writing": true, + "readCount": 0, + "lastUpdate": null, + "size": 0, + "hdfsPath": "/data/dnet.dev/mdstore/md-7557225f-77cc-407d-bdf4-d2fe03131464/md-7557225f-77cc-407d-bdf4-d2fe03131464-1611935085410" +} \ No newline at end of file From e423634cb6f1a7c301e5af04a9de8246e1e53d0c Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 29 Jan 2021 17:21:42 +0100 Subject: [PATCH 12/86] RollBack in case of error WORKS!!! 
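On failure, the Kill transition now hands the mdStoreVersion captured by the StartTransaction node to MDStoreActionNode, which resolves the abort endpoint and invokes it over REST. Below is a minimal sketch, not part of this patch, of the call the ROLLBACK action ends up issuing; it reuses the ROLLBACK_VERSION_URL template and the DNetRestClient introduced earlier in this series, while the manager URI and the version JSON argument are illustrative placeholders:

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.common.rest.DNetRestClient;

public class RollbackCallSketch {

	// same template as MDStoreActionNode.ROLLBACK_VERSION_URL
	private static final String ROLLBACK_VERSION_URL = "%s/version/%s/abort";

	public static void main(final String[] args) throws Exception {
		final String mdStoreManagerURI = "http://localhost:8080/mdstoremanager"; // placeholder
		// in the workflow this JSON comes from wf:actionData('StartTransaction')['mdStoreVersion']
		final MDStoreVersion version = new ObjectMapper().readValue(args[0], MDStoreVersion.class);
		// a plain GET on {managerURI}/version/{versionId}/abort discards the uncommitted version
		DNetRestClient.doGET(String.format(ROLLBACK_VERSION_URL, mdStoreManagerURI, version.getId()));
	}
}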
--- .../eu/dnetlib/dhp/collection/oozie_app/workflow.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index 28abe0965..527ec1727 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -41,7 +41,7 @@ collectionMode - Should be Refresh or Incremental + Should be REFRESH or INCREMENTAL @@ -164,8 +164,8 @@ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode - --actionCOMMIT - --mdStoreVersion${wf:actionData('CollectionWorker')['mdStoreVersion']} + --actionROLLBACK + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} --mdStoreManagerURI${mdStoreManagerURI} From b6b835ef49f3977cd43f1e5a1087720015e3a1ee Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 1 Feb 2021 08:49:42 +0100 Subject: [PATCH 13/86] update transformation Factory to get Transformation Rule by Id and not by Title --- .../dnetlib/dhp/transformation/TransformationFactory.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java index 58292139a..fbaef1d1f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java @@ -18,7 +18,7 @@ public class TransformationFactory { private static final Logger log = LoggerFactory.getLogger(TransformationFactory.class); - public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') where $x//TITLE = \"%s\" return $x//CODE/text()"; + public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') where $x//RESOURCE_IDENTIFIER/@value = \"%s\" return $x//CODE/text()"; public static MapFunction getTransformationPlugin( final Map jobArgument, final AggregationCounter counters, final ISLookUpService isLookupService) @@ -54,15 +54,15 @@ public class TransformationFactory { } } - private static String queryTransformationRuleFromIS(final String transformationRuleName, + private static String queryTransformationRuleFromIS(final String transformationRuleId, final ISLookUpService isLookUpService) throws Exception { - final String query = String.format(TRULE_XQUERY, transformationRuleName); + final String query = String.format(TRULE_XQUERY, transformationRuleId); log.info("asking query to IS: " + query); List result = isLookUpService.quickSearchProfile(query); if (result == null || result.isEmpty()) throw new DnetTransformationException( - "Unable to find transformation rule with name: " + transformationRuleName); + "Unable to find transformation rule with id: " + transformationRuleId); return result.get(0); } From 6ff234d81bd4cdec1c1c1745b24a7c3901e90afb Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 1 Feb 2021 13:56:05 +0100 Subject: [PATCH 14/86] Implemented a first prototype of incremental
harvesting and transformation using readlock --- dhp-common/pom.xml | 6 ++ .../common/AggregationUtility.java | 28 ++++++ .../GenerateNativeStoreSparkJob.java | 97 +++++++++++++++---- .../transformation/TransformSparkJobNode.java | 39 ++++---- .../transformation/TransformationFactory.java | 6 +- .../collection_input_parameters.json | 6 ++ .../collection/oozie_app/config-default.xml | 4 + .../dhp/collection/oozie_app/workflow.xml | 33 ++++++- .../oozie_app/config-default.xml | 4 + .../dhp/transformation/oozie_app/workflow.xml | 96 +++++++++++++++--- .../transformation_input_parameters.json | 10 +- .../eu/dnetlib/dhp/transform/ext_simple.xsl | 4 +- .../eu/dnetlib/dhp/transform/input.xml | 96 ++++++------------ 13 files changed, 297 insertions(+), 132 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index b295bc1f1..6eb2e0358 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -98,6 +98,12 @@ dnet-pace-core + + org.apache.httpcomponents + httpclient + + + eu.dnetlib.dhp dhp-schemas diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java new file mode 100644 index 000000000..1f5ed27cb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java @@ -0,0 +1,28 @@ + +package eu.dnetlib.dhp.aggregation.common; + +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.sql.SparkSession; + +public class AggregationUtility { + + public static void writeTotalSizeOnHDFS(final SparkSession spark, final Long total, final String path) + throws IOException { + + FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); + + FSDataOutputStream output = fs.create(new Path(path)); + + final BufferedOutputStream os = new BufferedOutputStream(output); + + os.write(total.toString().getBytes(StandardCharsets.UTF_8)); + + os.close(); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index b28327a40..466ddcd21 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -5,9 +5,9 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.*; import java.nio.charset.StandardCharsets; +import java.util.Collections; import java.util.Objects; import java.util.Optional; -import java.util.Properties; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -20,10 +20,9 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; +import
org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.apache.spark.sql.expressions.Aggregator; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; import org.dom4j.Node; @@ -34,19 +33,62 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; -import eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode; +import eu.dnetlib.dhp.aggregation.common.AggregationUtility; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication; -import eu.dnetlib.dhp.common.rest.DNetRestClient; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.model.mdstore.Provenance; -import eu.dnetlib.message.MessageManager; +import scala.Tuple2; public class GenerateNativeStoreSparkJob { private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class); private static final String DATASET_NAME = "/store"; + public static class MDStoreAggregator extends Aggregator { + + @Override + public MetadataRecord zero() { + return new MetadataRecord(); + } + + @Override + public MetadataRecord reduce(MetadataRecord b, MetadataRecord a) { + + return getLatestRecord(b, a); + } + + private MetadataRecord getLatestRecord(MetadataRecord b, MetadataRecord a) { + if (b == null) + return a; + + if (a == null) + return b; + return (a.getDateOfCollection() > b.getDateOfCollection()) ? a : b; + } + + @Override + public MetadataRecord merge(MetadataRecord b, MetadataRecord a) { + return getLatestRecord(b, a); + } + + @Override + public MetadataRecord finish(MetadataRecord j) { + return j; + } + + @Override + public Encoder bufferEncoder() { + return Encoders.kryo(MetadataRecord.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.kryo(MetadataRecord.class); + } + + } + public static void main(String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -70,6 +112,12 @@ public class GenerateNativeStoreSparkJob { final MDStoreVersion currentVersion = jsonMapper.readValue(mdStoreVersion, MDStoreVersion.class); + String readMdStoreVersionParam = parser.get("readMdStoreVersion"); + log.info("readMdStoreVersion is {}", readMdStoreVersionParam); + + final MDStoreVersion readMdStoreVersion = StringUtils.isBlank(readMdStoreVersionParam) ? 
null + : jsonMapper.readValue(readMdStoreVersionParam, MDStoreVersion.class); + Boolean isSparkSessionManaged = Optional .ofNullable(parser.get("isSparkSessionManaged")) .map(Boolean::valueOf) @@ -77,6 +125,9 @@ public class GenerateNativeStoreSparkJob { log.info("isSparkSessionManaged: {}", isSparkSessionManaged); SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(Collections.singleton(MetadataRecord.class).toArray(new Class[] {})); + runWithSparkSession( conf, isSparkSessionManaged, @@ -105,8 +156,27 @@ public class GenerateNativeStoreSparkJob { .distinct(); final Encoder encoder = Encoders.bean(MetadataRecord.class); + Dataset mdstore = spark.createDataset(nativeStore.rdd(), encoder); + if (readMdStoreVersion != null) { + // INCREMENTAL MODE + + Dataset currentMdStoreVersion = spark + .read() + .load(readMdStoreVersion.getHdfsPath() + DATASET_NAME) + .as(encoder); + TypedColumn aggregator = new MDStoreAggregator().toColumn(); + + mdstore = currentMdStoreVersion + .union(mdstore) + .groupByKey( + (MapFunction) MetadataRecord::getId, + Encoders.STRING()) + .agg(aggregator) + .map((MapFunction, MetadataRecord>) Tuple2::_2, encoder); + + } mdstore .write() .mode(SaveMode.Overwrite) @@ -116,17 +186,8 @@ public class GenerateNativeStoreSparkJob { final Long total = mdstore.count(); - FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); - - FSDataOutputStream output = fs.create(new Path(currentVersion.getHdfsPath() + "/size")); - - final BufferedOutputStream os = new BufferedOutputStream(output); - - os.write(total.toString().getBytes(StandardCharsets.UTF_8)); - - os.close(); + AggregationUtility.writeTotalSizeOnHDFS(spark, total, currentVersion.getHdfsPath() + "/size"); }); - } public static MetadataRecord parseRecord( diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index c6ed5a1e3..b9df902a1 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -3,14 +3,11 @@ package eu.dnetlib.dhp.transformation; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.io.ByteArrayInputStream; -import java.util.HashMap; +import java.io.IOException; import java.util.Map; -import java.util.Objects; import java.util.Optional; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -18,25 +15,18 @@ import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; -import org.dom4j.Document; -import org.dom4j.DocumentException; -import org.dom4j.Node; -import org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.aggregation.common.AggregationCounter; +import eu.dnetlib.dhp.aggregation.common.AggregationUtility; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import 
eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; -import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; -import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; -import eu.dnetlib.message.MessageType; public class TransformSparkJobNode { @@ -59,10 +49,14 @@ public class TransformSparkJobNode { .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("mdstoreInputPath"); - final String outputPath = parser.get("mdstoreOutputPath"); + final String mdstoreInputVersion = parser.get("mdstoreInputVersion"); + final String mdstoreOutputVersion = parser.get("mdstoreOutputVersion"); // TODO this variable will be used after implementing Messaging with DNet Aggregator + final ObjectMapper jsonMapper = new ObjectMapper(); + final MDStoreVersion nativeMdStoreVersion = jsonMapper.readValue(mdstoreInputVersion, MDStoreVersion.class); + final MDStoreVersion cleanedMdStoreVersion = jsonMapper.readValue(mdstoreOutputVersion, MDStoreVersion.class); + final String isLookupUrl = parser.get("isLookupUrl"); log.info(String.format("isLookupUrl: %s", isLookupUrl)); @@ -72,11 +66,14 @@ public class TransformSparkJobNode { runWithSparkSession( conf, isSparkSessionManaged, - spark -> transformRecords(parser.getObjectMap(), isLookupService, spark, inputPath, outputPath)); + spark -> transformRecords( + parser.getObjectMap(), isLookupService, spark, nativeMdStoreVersion.getHdfsPath(), + cleanedMdStoreVersion.getHdfsPath())); } public static void transformRecords(final Map args, final ISLookUpService isLookUpService, - final SparkSession spark, final String inputPath, final String outputPath) throws DnetTransformationException { + final SparkSession spark, final String inputPath, final String outputPath) + throws DnetTransformationException, IOException { final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems"); final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems"); @@ -86,11 +83,13 @@ public class TransformSparkJobNode { final Dataset mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder); final MapFunction XSLTTransformationFunction = TransformationFactory .getTransformationPlugin(args, ct, isLookUpService); - mdstoreInput.map(XSLTTransformationFunction, encoder).write().save(outputPath); + mdstoreInput.map(XSLTTransformationFunction, encoder).write().save(outputPath + "/store"); log.info("Transformed item " + ct.getProcessedItems().count()); log.info("Total item " + ct.getTotalItems().count()); log.info("Transformation Error item " + ct.getErrorItems().count()); + + AggregationUtility.writeTotalSizeOnHDFS(spark, ct.getProcessedItems().count(), outputPath + "/size"); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java index fbaef1d1f..d1f896964 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java @@ -30,13 +30,13 @@ public class 
TransformationFactory { log.info("Transformation plugin required " + transformationPlugin); switch (transformationPlugin) { case "XSLT_TRANSFORM": { - final String transformationRuleName = jobArgument.get("transformationRuleTitle"); - if (StringUtils.isBlank(transformationRuleName)) + final String transformationRuleId = jobArgument.get("transformationRuleId"); + if (StringUtils.isBlank(transformationRuleId)) throw new DnetTransformationException("Missing Parameter transformationRule"); final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService); final String transformationRule = queryTransformationRuleFromIS( - transformationRuleName, isLookupService); + transformationRuleId, isLookupService); final long dateOfTransformation = new Long(jobArgument.get("dateOfTransformation")); return new XSLTTransformationFunction(counters, transformationRule, dateOfTransformation, diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json index c1aa03bcd..987f004bb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json @@ -35,6 +35,12 @@ "paramDescription": "the Metadata Store Version Info", "paramRequired": true }, + { + "paramName": "rmv", + "paramLongName": "readMdStoreVersion", + "paramDescription": "the Read Lock Metadata Store Version bean", + "paramRequired": false + }, { "paramName": "w", "paramLongName": "workflowId", diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml index 2e0ed9aee..e77dd09c9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml @@ -15,4 +15,8 @@ oozie.action.sharelib.for.spark spark2 + + oozie.launcher.mapreduce.user.classpath.first + true + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index 527ec1727..9c213bee5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -51,7 +51,7 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -61,7 +61,7 @@ ${wf:conf('collectionMode') eq 'REFRESH'} ${wf:conf('collectionMode') eq 'INCREMENTAL'} - + @@ -99,7 +99,7 @@ --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} - + @@ -123,9 +123,10 @@ --provenance${dataSourceInfo} --xpath${identifierPath} --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --readMdStoreVersion${wf:actionData('BeginRead')['mdStoreReadLockVersion']} - + @@ -133,7 +134,7 @@ ${wf:conf('collectionMode') eq 'REFRESH'} ${wf:conf('collectionMode') eq 'INCREMENTAL'} - + @@ -161,6 +162,28 @@ + + + ${wf:conf('collectionMode') eq 'REFRESH'} + ${wf:conf('collectionMode') eq 'INCREMENTAL'} + + + + + + + 
eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml index 2e0ed9aee..e77dd09c9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml @@ -15,4 +15,8 @@ oozie.action.sharelib.for.spark spark2 + + oozie.launcher.mapreduce.user.classpath.first + true + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml index b36bc3766..aff87dc79 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml @@ -1,25 +1,25 @@ - mdstoreInputPath - the path of the native MDStore + mdStoreInputId + the identifier of the native MDStore - - mdstoreOutputPath + mdStoreOutputId + the identifier of the cleaned MDStore + + + mdStoreManagerURI the path of the cleaned mdstore - - transformationRuleTitle + transformationRuleId The transformation Rule to apply - transformationPlugin The transformation Plugin - dateOfTransformation The timestamp of the transformation date @@ -28,11 +28,34 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_LOCK + --mdStoreID${mdStoreInputId} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionNEW_VERSION + --mdStoreID${mdStoreOutputId} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + yarn @@ -49,18 +72,63 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --mdstoreInputPath${mdstoreInputPath} - --mdstoreOutputPath${mdstoreOutputPath} + --mdstoreInputVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdstoreOutputVersion${wf:actionData('BeginRead')['mdStoreReadLockVersion']} --dateOfTransformation${dateOfTransformation} --transformationPlugin${transformationPlugin} - --transformationRuleTitle${transformationRuleTitle} - - + --transformationRuleId${transformationRuleId} + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionCOMMIT + --namenode${nameNode} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + 
--actionROLLBACK + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json index cbd2f25ab..d92698de5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json @@ -13,19 +13,19 @@ }, { "paramName": "i", - "paramLongName": "mdstoreInputPath", - "paramDescription": "the path of the sequencial file to read", + "paramLongName": "mdstoreInputVersion", + "paramDescription": "the mdStore Version bean of the Input", "paramRequired": true }, { "paramName": "o", - "paramLongName": "mdstoreOutputPath", - "paramDescription": "the path of the result DataFrame on HDFS", + "paramLongName": "mdstoreOutputVersion", + "paramDescription": "the mdStore Version bean of the Output", "paramRequired": true }, { "paramName": "tr", - "paramLongName": "transformationRuleTitle", + "paramLongName": "transformationRuleId", "paramDescription": "the transformation Rule to apply to the input MDStore", "paramRequired": true }, diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl index 9e5f84c11..becd3a05e 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl @@ -9,7 +9,9 @@ - + + + diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml index 8efb3c487..ebe8e919b 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml @@ -1,68 +1,32 @@ - - - - od______2294::00029b7f0a2a7e090e55b625a9079d83 - oai:pub.uni-bielefeld.de:2578942 - 2018-11-23T15:15:33.974+01:00 - od______2294 - oai:pub.uni-bielefeld.de:2578942 - 2018-07-24T13:01:16Z - conference - ddc:000 - conferenceFtxt - driver - open_access - - - - Mobile recommendation agents making online use of visual attention information at the point of sale - Pfeiffer, Thies - Pfeiffer, Jella - Meißner, Martin - Davis, Fred - Riedl, René - Jan, vom Brocke - Léger, Pierre-Majorique - Randolph, Adriane - Mobile Cognitive Assistance Systems - Information Systems - ddc:000 - We aim to utilize online information about visual attention for developing mobile recommendation agents (RAs) for use at the point of sale. Up to now, most RAs are focussed exclusively at personalization in an e-commerce setting. Very little is known, however, about mobile RAs that offer information and assistance at the point of sale based on individual-level feature based preference models (Murray and Häubl 2009). Current attempts provide information about products at the point of sale by manually scanning barcodes or using RFID (Kowatsch et al. 2011, Heijden 2005), e.g. using specific apps for smartphones. 
We argue that an online access to the current visual attention of the user offers a much larger potential. Integrating mobile eye tracking into ordinary glasses would yield a direct benefit of applying neuroscience methods in the user’s everyday life. First, learning from consumers’ attentional processes over time and adapting recommendations based on this learning allows us to provide very accurate and relevant recommendations, potentially increasing the perceived usefulness. Second, our proposed system needs little explicit user input (no scanning or navigation on screen) making it easy to use. Thus, instead of learning from click behaviour and past customer ratings, as it is the case in the e-commerce setting, the mobile RA learns from eye movements by participating online in every day decision processes. We argue that mobile RAs should be built based on current research in human judgment and decision making (Murray et al. 2010). In our project, we therefore follow a two-step approach: In the empirical basic research stream, we aim to understand the user’s interaction with the product shelf: the actions and patterns of user’s behaviour (eye movements, gestures, approaching a product closer) and their correspondence to the user’s informational needs. In the empirical system development stream, we create prototypes of mobile RAs and test experimentally the factors that influence the user’s adoption. For example, we suggest that a user’s involvement in the process, such as a need for exact nutritional information or for assistance (e.g., reading support for elderly) will influence the user’s intention to use such as system. The experiments are conducted both in our immersive virtual reality supermarket presented in a CAVE, where we can also easily display information to the user and track the eye movement in great accuracy, as well as in real-world supermarkets (see Figure 1), so that the findings can be better generalized to natural decision situations (Gidlöf et al. 2013). In a first pilot study with five randomly chosen participants in a supermarket, we evaluated which sort of mobile RAs consumers favour in order to get a first impression of the user’s acceptance of the technology. Figure 1 shows an excerpt of one consumer’s eye movements during a decision process. First results show long eye cascades and short fixations on many products in situations where users are uncertain and in need for support. Furthermore, we find a surprising acceptance of the technology itself throughout all ages (23 – 61 years). At the same time, consumers express serious fear of being manipulated by such a technology. For that reason, they strongly prefer the information to be provided by trusted third party or shared with family members and friends (see also Murray and Häubl 2009). Our pilot will be followed by a larger field experiment in March in order to learn more about factors that influence the user’s acceptance as well as the eye movement patterns that reflect typical phases of decision processes and indicate the need for support by a RA. - 2013 - info:eu-repo/semantics/conferenceObject - doc-type:conferenceObject - text - https://pub.uni-bielefeld.de/record/2578942 - https://pub.uni-bielefeld.de/download/2578942/2602478 - Pfeiffer T, Pfeiffer J, Meißner M. Mobile recommendation agents making online use of visual attention information at the point of sale. In: Davis F, Riedl R, Jan vom B, Léger P-M, Randolph A, eds. Proceedings of the Gmunden Retreat on NeuroIS 2013. 2013: 3-3. 
- eng - info:eu-repo/semantics/openAccess + +
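The replacement test record that follows is a header-only OAI envelope from the PSNC repository; all the collection job needs from it is the identifier matched by the test xpath //*[local-name()='header']/*[local-name()='identifier']. A minimal dom4j sketch of that extraction (assuming jaxen on the classpath; the record body is a trimmed, invented stand-in, and whether parseRecord uses valueOf or an equivalent node lookup is not shown in this series):

    import java.io.StringReader;

    import org.dom4j.Document;
    import org.dom4j.io.SAXReader;

    public class XpathIdentifierSketch {
        public static void main(String[] args) throws Exception {
            String xml = "<record xmlns=\"http://www.openarchives.org/OAI/2.0/\">"
                + "<header><identifier>oai:lib.psnc.pl:278</identifier>"
                + "<datestamp>2011-08-25T15:17:13Z</datestamp></header></record>";
            Document document = new SAXReader().read(new StringReader(xml));
            // local-name() sidesteps the OAI namespace, like the xpath used in the aggregation tests
            String id = document.valueOf("//*[local-name()='header']/*[local-name()='identifier']");
            System.out.println(id); // prints oai:lib.psnc.pl:278
        }
    }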
+ oai:lib.psnc.pl:278 + 2011-08-25T15:17:13Z + PSNCRepository:PSNCExternalRepository:exhibitions + PSNCRepository:PSNCExternalRepository:Departments + PSNCRepository:PSNCExternalRepository:Departments:NetworkServices + PSNCRepository:PSNCExternalRepository + PSNCRepository:PSNCExternalRepository:publications + PSNCRepository +
+ + + + + + + + + + + + + + + + - - - - http://pub.uni-bielefeld.de/oai - oai:pub.uni-bielefeld.de:2578942 - 2018-07-24T13:01:16Z - http://www.openarchives.org/OAI/2.0/oai_dc/ - - - - false - false - 0.9 - - - - -
+ \ No newline at end of file From bead34d11a889b716a256fb0b763995d2c220f0f Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 1 Feb 2021 14:58:06 +0100 Subject: [PATCH 15/86] code refactor --- .../GenerateNativeStoreSparkJob.java | 38 +++++++++++-------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index 466ddcd21..553a3dc5f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -157,8 +157,9 @@ public class GenerateNativeStoreSparkJob { final Encoder encoder = Encoders.bean(MetadataRecord.class); - Dataset mdstore = spark.createDataset(nativeStore.rdd(), encoder); + final Dataset mdstore = spark.createDataset(nativeStore.rdd(), encoder); + final String targetPath = currentVersion.getHdfsPath() + DATASET_NAME; if (readMdStoreVersion != null) { // INCREMENTAL MODE @@ -168,28 +169,35 @@ public class GenerateNativeStoreSparkJob { .as(encoder); TypedColumn aggregator = new MDStoreAggregator().toColumn(); - mdstore = currentMdStoreVersion - .union(mdstore) - .groupByKey( - (MapFunction) MetadataRecord::getId, - Encoders.STRING()) - .agg(aggregator) - .map((MapFunction, MetadataRecord>) Tuple2::_2, encoder); + saveDataset( + currentMdStoreVersion + .union(mdstore) + .groupByKey( + (MapFunction) MetadataRecord::getId, + Encoders.STRING()) + .agg(aggregator) + .map((MapFunction, MetadataRecord>) Tuple2::_2, encoder), + targetPath); + } else { + saveDataset(mdstore, targetPath); } - mdstore - .write() - .mode(SaveMode.Overwrite) - .format("parquet") - .save(currentVersion.getHdfsPath() + DATASET_NAME); - mdstore = spark.read().load(currentVersion.getHdfsPath() + DATASET_NAME).as(encoder); - final Long total = mdstore.count(); + final Long total = spark.read().load(targetPath).count(); AggregationUtility.writeTotalSizeOnHDFS(spark, total, currentVersion.getHdfsPath() + "/size"); }); } + private static void saveDataset(final Dataset currentMdStore, final String targetPath) { + currentMdStore + .write() + .mode(SaveMode.Overwrite) + .format("parquet") + .save(targetPath); + + } + public static MetadataRecord parseRecord( final String input, final String xpath, From 8eaa1fd4b411c6757bc75b9f38a155b106e97a29 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 1 Feb 2021 19:29:10 +0100 Subject: [PATCH 16/86] WIP: metadata collection in INCREMENTAL mode and relative test --- .../dhp/model/mdstore/MetadataRecord.java | 16 +- .../common/AggregationUtility.java | 33 ++- .../GenerateNativeStoreSparkJob.java | 259 +++++++++--------- .../worker/CollectorWorkerApplication.java | 10 +- .../GenerateNativeStoreSparkJobTest.java | 169 ++++++++++++ .../eu/dnetlib/dhp/collection/input.json | 9 - .../dhp/collection/mdStoreVersion_1.json | 9 + .../dhp/collection/mdStoreVersion_2.json | 9 + .../eu/dnetlib/dhp/collection/provenance.json | 5 + .../eu/dnetlib/dhp/collection/sequence_file | Bin 0 -> 52308 bytes 10 files changed, 360 insertions(+), 159 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java delete mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/input.json create mode 100644 
dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_1.json create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_2.json create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/provenance.json create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/sequence_file diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java index ce65e710f..0b59dcce0 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java @@ -26,13 +26,13 @@ public class MetadataRecord implements Serializable { private String body; /** the date when the record has been stored */ - private long dateOfCollection; + private Long dateOfCollection; /** the date when the record has been transformed */ - private long dateOfTransformation; + private Long dateOfTransformation; public MetadataRecord() { - this.dateOfCollection = System.currentTimeMillis(); + } public MetadataRecord( @@ -40,7 +40,7 @@ public class MetadataRecord implements Serializable { String encoding, Provenance provenance, String body, - long dateOfCollection) { + Long dateOfCollection) { this.originalId = originalId; this.encoding = encoding; @@ -90,19 +90,19 @@ public class MetadataRecord implements Serializable { this.body = body; } - public long getDateOfCollection() { + public Long getDateOfCollection() { return dateOfCollection; } - public void setDateOfCollection(long dateOfCollection) { + public void setDateOfCollection(Long dateOfCollection) { this.dateOfCollection = dateOfCollection; } - public long getDateOfTransformation() { + public Long getDateOfTransformation() { return dateOfTransformation; } - public void setDateOfTransformation(long dateOfTransformation) { + public void setDateOfTransformation(Long dateOfTransformation) { this.dateOfTransformation = dateOfTransformation; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java index 1f5ed27cb..eb971c475 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java @@ -8,21 +8,38 @@ import java.nio.charset.StandardCharsets; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; public class AggregationUtility { + private static final Logger log = LoggerFactory.getLogger(AggregationUtility.class); + public static void writeTotalSizeOnHDFS(final SparkSession spark, final Long total, final String path) throws IOException { - FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); + log.info("writing size ({}) in file {}", total, path); + try (FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); 
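// try-with-resources: the FileSystem handle and the buffered stream below are both closed, even when the write fails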
BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) { + os.write(total.toString().getBytes(StandardCharsets.UTF_8)); + os.flush(); + } - FSDataOutputStream output = fs.create(new Path(path)); - - final BufferedOutputStream os = new BufferedOutputStream(output); - - os.write(total.toString().getBytes(StandardCharsets.UTF_8)); - - os.close(); } + + public static void saveDataset(final Dataset mdstore, final String targetPath) { + log.info("saving dataset in: {}", targetPath); + mdstore + .write() + .mode(SaveMode.Overwrite) + .format("parquet") + .save(targetPath); + } + } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index 553a3dc5f..bbed36a9c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -1,23 +1,20 @@ package eu.dnetlib.dhp.collection; +import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.*; import java.nio.charset.StandardCharsets; -import java.util.Collections; +import java.util.List; import java.util.Objects; import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; @@ -30,31 +27,155 @@ import org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; -import eu.dnetlib.dhp.aggregation.common.AggregationUtility; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.model.mdstore.Provenance; +import net.sf.saxon.expr.Component; import scala.Tuple2; public class GenerateNativeStoreSparkJob { private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class); + + private static final ObjectMapper MAPPER = new ObjectMapper(); + private static final String DATASET_NAME = "/store"; + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateNativeStoreSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); + parser.parseArgument(args); + + final String provenanceArgument = parser.get("provenance"); + log.info("Provenance is {}", provenanceArgument); + final Provenance provenance = MAPPER.readValue(provenanceArgument, Provenance.class); + + final String dateOfCollectionArgs = parser.get("dateOfCollection"); + log.info("dateOfCollection is {}", dateOfCollectionArgs); + final Long dateOfCollection = new 
Long(dateOfCollectionArgs); + + String mdStoreVersion = parser.get("mdStoreVersion"); + log.info("mdStoreVersion is {}", mdStoreVersion); + + final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class); + + String readMdStoreVersionParam = parser.get("readMdStoreVersion"); + log.info("readMdStoreVersion is {}", readMdStoreVersionParam); + + final MDStoreVersion readMdStoreVersion = StringUtils.isBlank(readMdStoreVersionParam) ? null + : MAPPER.readValue(readMdStoreVersionParam, MDStoreVersion.class); + + final String xpath = parser.get("xpath"); + log.info("xpath is {}", xpath); + + final String encoding = parser.get("encoding"); + log.info("encoding is {}", encoding); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + SparkConf conf = new SparkConf(); + /* + * conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf .registerKryoClasses( new + * Class[] { MetadataRecord.class, Provenance.class }); + */ + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> createNativeMDStore( + spark, provenance, dateOfCollection, xpath, encoding, currentVersion, readMdStoreVersion)); + } + + private static void createNativeMDStore(SparkSession spark, + Provenance provenance, + Long dateOfCollection, + String xpath, + String encoding, + MDStoreVersion currentVersion, + MDStoreVersion readVersion) throws IOException { + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); + final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); + + final String seqFilePath = currentVersion.getHdfsPath() + CollectorWorkerApplication.SEQUENCE_FILE_NAME; + final JavaRDD nativeStore = sc + .sequenceFile(seqFilePath, IntWritable.class, Text.class) + .map( + item -> parseRecord( + item._2().toString(), + xpath, + encoding, + provenance, + dateOfCollection, + totalItems, + invalidRecords)) + .filter(Objects::nonNull) + .distinct(); + + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset mdstore = spark.createDataset(nativeStore.rdd(), encoder); + + final String targetPath = currentVersion.getHdfsPath() + DATASET_NAME; + + if (readVersion != null) { // INCREMENTAL MODE + log.info("updating {} incrementally with {}", targetPath, readVersion.getHdfsPath()); + Dataset currentMdStoreVersion = spark + .read() + .load(readVersion.getHdfsPath() + DATASET_NAME) + .as(encoder); + TypedColumn aggregator = new MDStoreAggregator().toColumn(); + + final Dataset map = currentMdStoreVersion + .union(mdstore) + .groupByKey( + (MapFunction) MetadataRecord::getId, + Encoders.STRING()) + .agg(aggregator) + .map((MapFunction, MetadataRecord>) Tuple2::_2, encoder); + + map.select("id").takeAsList(100).forEach(s -> log.info(s.toString())); + + saveDataset(map, targetPath); + + } else { + saveDataset(mdstore, targetPath); + } + + final Long total = spark.read().load(targetPath).count(); + log.info("collected {} records for datasource '{}'", total, provenance.getDatasourceName()); + + writeTotalSizeOnHDFS(spark, total, currentVersion.getHdfsPath() + "/size"); + } + public static class MDStoreAggregator extends Aggregator { @Override public MetadataRecord zero() { - return new MetadataRecord(); + return null; } @Override public MetadataRecord reduce(MetadataRecord 
b, MetadataRecord a) { + return getLatestRecord(b, a); + } + @Override + public MetadataRecord merge(MetadataRecord b, MetadataRecord a) { return getLatestRecord(b, a); } @@ -68,136 +189,22 @@ public class GenerateNativeStoreSparkJob { } @Override - public MetadataRecord merge(MetadataRecord b, MetadataRecord a) { - return getLatestRecord(b, a); - } - - @Override - public MetadataRecord finish(MetadataRecord j) { - return j; + public MetadataRecord finish(MetadataRecord r) { + return r; } @Override public Encoder bufferEncoder() { - return Encoders.kryo(MetadataRecord.class); + return Encoders.bean(MetadataRecord.class); } @Override public Encoder outputEncoder() { - return Encoders.kryo(MetadataRecord.class); + return Encoders.bean(MetadataRecord.class); } } - public static void main(String[] args) throws Exception { - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - GenerateNativeStoreSparkJob.class - .getResourceAsStream( - "/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); - parser.parseArgument(args); - final ObjectMapper jsonMapper = new ObjectMapper(); - final String provenanceArgument = parser.get("provenance"); - log.info("Provenance is {}", provenanceArgument); - final Provenance provenance = jsonMapper.readValue(provenanceArgument, Provenance.class); - - final String dateOfCollectionArgs = parser.get("dateOfCollection"); - log.info("dateOfCollection is {}", dateOfCollectionArgs); - final long dateOfCollection = new Long(dateOfCollectionArgs); - - String mdStoreVersion = parser.get("mdStoreVersion"); - log.info("mdStoreVersion is {}", mdStoreVersion); - - final MDStoreVersion currentVersion = jsonMapper.readValue(mdStoreVersion, MDStoreVersion.class); - - String readMdStoreVersionParam = parser.get("readMdStoreVersion"); - log.info("readMdStoreVersion is {}", readMdStoreVersionParam); - - final MDStoreVersion readMdStoreVersion = StringUtils.isBlank(readMdStoreVersionParam) ? 
null - : jsonMapper.readValue(readMdStoreVersionParam, MDStoreVersion.class); - - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(Collections.singleton(MetadataRecord.class).toArray(new Class[] {})); - - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - - final JavaPairRDD inputRDD = sc - .sequenceFile( - currentVersion.getHdfsPath() + CollectorWorkerApplication.SEQUENTIAL_FILE_NAME, - IntWritable.class, Text.class); - - final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); - final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); - - final JavaRDD nativeStore = inputRDD - .map( - item -> parseRecord( - item._2().toString(), - parser.get("xpath"), - parser.get("encoding"), - provenance, - dateOfCollection, - totalItems, - invalidRecords)) - .filter(Objects::nonNull) - .distinct(); - - final Encoder encoder = Encoders.bean(MetadataRecord.class); - - final Dataset mdstore = spark.createDataset(nativeStore.rdd(), encoder); - - final String targetPath = currentVersion.getHdfsPath() + DATASET_NAME; - if (readMdStoreVersion != null) { - // INCREMENTAL MODE - - Dataset currentMdStoreVersion = spark - .read() - .load(readMdStoreVersion.getHdfsPath() + DATASET_NAME) - .as(encoder); - TypedColumn aggregator = new MDStoreAggregator().toColumn(); - - saveDataset( - currentMdStoreVersion - .union(mdstore) - .groupByKey( - (MapFunction) MetadataRecord::getId, - Encoders.STRING()) - .agg(aggregator) - .map((MapFunction, MetadataRecord>) Tuple2::_2, encoder), - targetPath); - - } else { - saveDataset(mdstore, targetPath); - } - - final Long total = spark.read().load(targetPath).count(); - - AggregationUtility.writeTotalSizeOnHDFS(spark, total, currentVersion.getHdfsPath() + "/size"); - }); - } - - private static void saveDataset(final Dataset currentMdStore, final String targetPath) { - currentMdStore - .write() - .mode(SaveMode.Overwrite) - .format("parquet") - .save(targetPath); - - } - public static MetadataRecord parseRecord( final String input, final String xpath, @@ -219,7 +226,7 @@ public class GenerateNativeStoreSparkJob { invalidRecords.add(1); return null; } - return new MetadataRecord(originalIdentifier, encoding, provenance, input, dateOfCollection); + return new MetadataRecord(originalIdentifier, encoding, provenance, document.asXML(), dateOfCollection); } catch (Throwable e) { invalidRecords.add(1); return null; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java index 29ae98c5b..e24b9ad1d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java @@ -1,11 +1,6 @@ package eu.dnetlib.dhp.collection.worker; -import java.io.File; -import java.io.FileOutputStream; -import java.io.OutputStream; -import java.util.Properties; - import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; @@ -16,7 +11,6 @@ import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.common.rest.DNetRestClient; /** * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module @@ -31,7 +25,7 @@ public class CollectorWorkerApplication { private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory(); - public static String SEQUENTIAL_FILE_NAME = "/sequence_file"; + public static String SEQUENCE_FILE_NAME = "/sequence_file"; /** * @param args @@ -61,7 +55,7 @@ public class CollectorWorkerApplication { final ApiDescriptor api = jsonMapper.readValue(apiDescriptor, ApiDescriptor.class); final CollectorWorker worker = new CollectorWorker(collectorPluginFactory, api, hdfsuri, - currentVersion.getHdfsPath() + SEQUENTIAL_FILE_NAME); + currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME); worker.collect(); } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java new file mode 100644 index 000000000..715ad8fa6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java @@ -0,0 +1,169 @@ + +package eu.dnetlib.dhp.collection; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; + +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +public class GenerateNativeStoreSparkJobTest { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + private static Encoder encoder; + + private static final String encoding = "XML"; + private static final String dateOfCollection = System.currentTimeMillis() + ""; + private static final String xpath = "//*[local-name()='header']/*[local-name()='identifier']"; + private static String provenance; + + private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJobTest.class); + + @BeforeAll + public static void beforeAll() throws IOException { + provenance = IOUtils.toString(GenerateNativeStoreSparkJobTest.class.getResourceAsStream("provenance.json")); + workingDir = Files.createTempDirectory(GenerateNativeStoreSparkJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + + 
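// local-mode test configuration: everything below pins Spark to the JVM running the test, disables the UI, and keeps the warehouse/metastore dirs under the temp working dir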
conf.setAppName(GenerateNativeStoreSparkJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + encoder = Encoders.bean(MetadataRecord.class); + spark = SparkSession + .builder() + .appName(GenerateNativeStoreSparkJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + @Order(1) + public void testGenerateNativeStoreSparkJobRefresh() throws Exception { + + MDStoreVersion mdStoreV1 = prepareVersion("mdStoreVersion_1.json"); + FileUtils.forceMkdir(new File(mdStoreV1.getHdfsPath())); + + IOUtils + .copy( + getClass().getResourceAsStream("sequence_file"), + new FileOutputStream(mdStoreV1.getHdfsPath() + "/sequence_file")); + + GenerateNativeStoreSparkJob + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-encoding", encoding, + "-dateOfCollection", dateOfCollection, + "-provenance", provenance, + "-xpath", xpath, + "-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1), + "-readMdStoreVersion", "", + "-workflowId", "abc" + }); + + verify(mdStoreV1); + } + + @Test + @Order(2) + public void testGenerateNativeStoreSparkJobIncremental() throws Exception { + + MDStoreVersion mdStoreV2 = prepareVersion("mdStoreVersion_2.json"); + FileUtils.forceMkdir(new File(mdStoreV2.getHdfsPath())); + + IOUtils + .copy( + getClass().getResourceAsStream("sequence_file"), + new FileOutputStream(mdStoreV2.getHdfsPath() + "/sequence_file")); + + MDStoreVersion mdStoreV1 = prepareVersion("mdStoreVersion_1.json"); + + GenerateNativeStoreSparkJob + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-encoding", encoding, + "-dateOfCollection", dateOfCollection, + "-provenance", provenance, + "-xpath", xpath, + "-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV2), + "-readMdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1), + "-workflowId", "abc" + }); + + verify(mdStoreV2); + } + + protected void verify(MDStoreVersion mdStoreVersion) throws IOException { + Assertions.assertTrue(new File(mdStoreVersion.getHdfsPath()).exists()); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + long seqFileSize = sc + .sequenceFile(mdStoreVersion.getHdfsPath() + "/sequence_file", IntWritable.class, Text.class) + .count(); + + final Dataset mdstore = spark.read().load(mdStoreVersion.getHdfsPath() + "/store").as(encoder); + long mdStoreSize = mdstore.count(); + + long declaredSize = Long.parseLong(IOUtils.toString(new FileReader(mdStoreVersion.getHdfsPath() + "/size"))); + + Assertions.assertEquals(seqFileSize, declaredSize, "the size must be equal"); + Assertions.assertEquals(seqFileSize, mdStoreSize, "the size must be equal"); + + long uniqueIds = mdstore + .map((MapFunction) MetadataRecord::getId, Encoders.STRING()) + .distinct() + .count(); + + Assertions.assertEquals(seqFileSize, uniqueIds, "the size must be equal"); + } + + private MDStoreVersion prepareVersion(String filename) throws IOException { + MDStoreVersion mdstore = OBJECT_MAPPER + .readValue(IOUtils.toString(getClass().getResource(filename)), MDStoreVersion.class); + 
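// the hdfsPath in the fixture JSON carries a %s placeholder, bound here to the per-run temp directory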
mdstore.setHdfsPath(String.format(mdstore.getHdfsPath(), workingDir.toString())); + return mdstore; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/input.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/input.json deleted file mode 100644 index 4ffc33d24..000000000 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/input.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "id": "md-7557225f-77cc-407d-bdf4-d2fe03131464-1611935085410", - "mdstore": "md-7557225f-77cc-407d-bdf4-d2fe03131464", - "writing": true, - "readCount": 0, - "lastUpdate": null, - "size": 0, - "hdfsPath": "/data/dnet.dev/mdstore/md-7557225f-77cc-407d-bdf4-d2fe03131464/md-7557225f-77cc-407d-bdf4-d2fe03131464-1611935085410" -} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_1.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_1.json new file mode 100644 index 000000000..8945c3d88 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_1.json @@ -0,0 +1,9 @@ +{ + "id":"md-84e86d00-5771-4ed9-b17f-177ef4b46e42-1612187678801", + "mdstore":"md-84e86d00-5771-4ed9-b17f-177ef4b46e42", + "writing":true, + "readCount":0, + "lastUpdate":null, + "size":0, + "hdfsPath":"%s/mdstore/md-84e86d00-5771-4ed9-b17f-177ef4b46e42/v1" +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_2.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_2.json new file mode 100644 index 000000000..c3d4617cb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreVersion_2.json @@ -0,0 +1,9 @@ +{ + "id":"md-84e86d00-5771-4ed9-b17f-177ef4b46e42-1612187459108", + "mdstore":"md-84e86d00-5771-4ed9-b17f-177ef4b46e42", + "writing":false, + "readCount":1, + "lastUpdate":1612187563099, + "size":71, + "hdfsPath":"%s/mdstore/md-84e86d00-5771-4ed9-b17f-177ef4b46e42/v2" +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/provenance.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/provenance.json new file mode 100644 index 000000000..2cf0dab70 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/provenance.json @@ -0,0 +1,5 @@ +{ + "datasourceId":"74912366-d6df-49c1-a1fd-8a52fa98ce5f_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU\u003d", + "datasourceName":"PSNC Institutional Repository", + "nsPrefix":"psnc______pl" +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/sequence_file b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/sequence_file new file mode 100644 index 0000000000000000000000000000000000000000..309645a5f2526ed6da0127e67a0664f7b79d96ff GIT binary patch literal 52308 zcmZ^qV{{#G)9o8Kwr$&PY}>Z&q_J(=wrv}YZJX^qeV(qn?ppVK&&QneEx(!no;`b% zL=?aY>>SPL4D1bz%uVRb4UFyV?CC7*=p<~N)f_FH4GgVKF#pFND4V!D1AU|VAHOrQ zv$1zHadM&)HZe7Dv33@+Gd3~$2KZEO&2&XkDHUiFo?j6_Dh_=62K)=~XVAajxn~up zY}u|epp4qB@`;QL*GNjQH)|F41(~zUT%=0Um#8coXMwa~Y&2wpyN7kpH;JuNUmivl zqsE&@w)M@t{h0}$4SjnuMZnab7}_?ZE4peanK_G#W0CPG7El~4`2?z}oeI)^?oIIN 
z71He9$~K2$0T1kTCBh4I_MUPmr+^2;e_NM4wn79K8$~SV-PyXkMIK)AcX6)q`p70Q zcjIe$fwv17WZ?L|ga0RN?9a)Vm>0Bdng3Hw2zsE^+=OH46w_2nDKvzrqQV9rkW6%D zp%QkyjMhSD3e zsliml!uQB0S5(Ed1XarFac4te+lW|!AHH=Yw}H)91NUNl3xS*7y^D}zFAt4`nFCoZ z?L+r*F!ClI<`2`%Dp|vgf{!h&Pv~qT3$eHIIBviVKZOL2iZ&JsmLs>f;bh<0zcyZ@ zVqu=vw;^-w#e!37rx!k~PjMQcIyx^5qEpiX#$e>7<&YmDB`zbXQ~dsXj?`w~@!mv~ z$kR}3kFP?xY<#nJ5_}qZPd%N}1}4i=XBwj5z=4!r7%Y!1W3{1*n5IS8CMHqcm{roUKh*m&r7?zMykUU^n<;r)=HfFz8=mwvedFSriHxTlF zfCB*lSN$Wq!GQ3)lP7qU{~0|xlZh=%u(X9>2ppzbT$&(A`((XC#%R*@`<2jTbP4Z* zu0g6EskAYbyoe?4&db(YC%2v-o2?J5_W4g9nt?GKmeLif*aZ>^oN@xg47p)m46$>_ zuNCfm+)|m$t?CedFtnc|da!z+2pWuNZAsGZ`52rN!kY^DC%%>+O%1)v1h01>rv2(I z@>{YSE?QUJs%9?>TTcsSf7VN5KgMF8!7t^3ZggR0A$9~Lh^v;Q=ymXT)#nhQscYc{ z7;Vr`mtLEK5~d!s+Pa+T!j~QhPrds0rZ??kWQGgn?M;KD*x--#gmP7x+P_W8d!>+J zD0!gU)9-p%xW9QeLXF#4H2zf8r&=*Ejo3gCB%^V`b+h@)8|}a z^8a;ZgN|Kro8EbXg>l?K_w#_i*Ke<|@Q95QSu?|XIw7H=&raBt?il!Ki%d^l*i#`- zEtar5=tOwPr(UJXphpJ0&d~D>DDh=D{T%h?rvCy7kv|#UoTP6$ZLJP73le=~Px48_Pf)9ZV6H?~5yi z{)b1jFTbQsnBw9^0zfK_-01^Aveb)H&o?U`9;f=eAiLUpDjd*&6pRbRz7l_LsrcEa z?qJJ`1b9Ak(aMEpTU59c^EuO%IP1(*2TPUCqUH@sALVtvAPi$j%w(*N{3m?pxVQVWqwGUL?dp*4P z+tHC=L0Ni~q6agR>|MM>p*_w>lT%Np&@DRjalc1O`k606$e=#cCVv}Du805zu5^vz z`v=aO!j69~_U>Zj+uI#C!@5MR4d(LxBsaWjebd7`C`==tP$69jZBi8te-Jengr0fk zRKIUL4&Kz4{*JV7qU221lBJi~Z`CLZ@k}Vno(^SJZ{mg?_Ac z|MblgY89Rk6jlG?J&MiKZO*sQ=>0QEAO7!x?|YZiS@eWPLgjTlC=kM&nC;7x>>c65 z{O#f`tGHX7{>Au4@?DRVOK@V8O08lbiEt%%2eG5%jg!AdF-;{@qi7tHZ)dSci~W56 zg!s`(C~T{jNcNDVqs2)Qg;QsTR?Sn04_hzuL`G1Kk3|pBSiwV1lI$G~oEJP1=gEfx7Q(4DqpGUQy`?>2L3}a)Zj`2E% z2&A6-g>47Q{1W;u>)1~nB9IQ5vL#h#$}CPUmPSjtof?2D>4-yrC|J4x zqDPf3zOa{a+aB73m*O+E#&&?FdQm;u%F_Kk%he=-U6oxV*A-)6*{ekIM7K!F&I zwhM?AG+U?6f0_2Q4uU%}5DJB+Q1HQ(jMq&{2gm9tes^pGCr8{N=%cuKVT;KpLhVRR_aNUnPA zqRExO1wj_Eu=(j30ScldxoLnj7?Jo;Q(^TXZZ_y|33}t`hZTRZUIyUM6@dzf?wT8X z7r9u^QMKQlc^43)rTl`=_OkCmt*0YcZ@Z4(jPDVe8hc6OfeW8?&gLW}(W_=<{cTBE z>v7A#4Q>~4P?PIcmVqVcWEuiz7|7bBDQ~!bfjXLZxv`g5P}>)-KF?pD4{5WE!Ni2+ zyd+|V8i;Glld}~Qf7O%gsA~#a!3sPjrEDAM{?@KJCqGY?;j`ZU+uRI-j+X(y|0U@287)Q8rxmcRKupI`%sphlz*b^%7Ce8a7GcL z;?0_gns`vz+Jdu?U5HoYwF%@w*+F06LGVRtdZZmLEFLM+h@5QeKqCDCQkj@79qEz~ zHMToRLc8zk*|4mrWMwYGoI&Hr<6{e}lKochOVd=yI@r3AAA+_4E3`+w~s^K9wjv14sGvptZ0ONE#!kL_TFn&`|N3tPvF^yg{^9HhqMSuwrKg6l{>CEd>=7vUQGE&B4_4nakCf=VG(^E1UInGBszx%$6L}@pPw@^u@8V zrTY{gbI@Q%RDGj05E>m>0+{%$1tn?DuFGLFG_C3b_Zh#hVM=+g*4ASW{uz&lb-VRH zc{_YK8jhkR%=gLJcQ1m8GjZ2aCh~3Eo$4i)9btclF59sMr2`dROg*k5=HQC^ zP=bouQMbl~HIY&zFmdJTZOkaKwwFrE@x9gLzR`?{68b3gH0yod3}W_Lsi?zw~vh1k~DdKB9E7StN}y)hI3q6~`4|l&ch$X|8}+ zt9z++O2K#w^Ov?(k`o~P*#K*_d|QtwLkZP{Rba5wf;Os59Ve-LWoxRssN&|y2czPYV)XC zURetz87pCV*$mqQqh>$sn)bN}%zHdDhvvuW`jR6NM`6n3<*i{3u#jS=F;t4-jRm7V z&QWU+7$kKyV$z)f9A$k`W8sRqW%)2##3+IOhWdWzZ@f;X&s?0hGqmjoo?LHC-BD{e z%By>rt9ZsADSQ6E__|U|=yjC~oH$l=(m|{74V7qGfg=Jy1R6DZ3x&y{^mbrt=tfiB zS{)Pr{C-02g`WvJ2rCK>9$ZnPePGSxZf8yyU})=&ojc>MS4_3Y<0v9KBHW^3rIF17 zvM$qA1Lj^4Kq9U{Z>WmN;V?>9Y<$Ddp3MV;f~nWz@lwre8^Q6Uzx zSj$tR@XGt>A?zZNEtV~}DNrn~uo6^esy-zxM=7%@^05ina?!IWT_OCm{7VL!@F zpY$eRjs*6pOuY>u_u2GRNqN7vns}UA0MK``>zUmd@SO@Vt8My+-O3F$y{lLthpSb! 
zdq%POe!@&fD4~c*bsnig;M#Nh+my$#6?0f>!fEPUTy$= z_iaVZRMr+8f2@3mIJ<2t0__JUhPRbSfYAAn^sb@;q9o%}vrKxe35Ir-Sdr_tt;cI5 zhl@V0jI;OpMfcV7OlSWu5_eqi(pT^Bu^p$wQFmzDD*xfugNr?}xe2J!DK46n8#Op- zNfVn7Bo{vtLa)TIqmz7Xmz@KowXoIno*n3!X}4z_HWV*Mq^WI*FP=nGroO8;3{ z#!HJ8RsA`kF{WV=U8jwI9LJR6>^g@fZ6<0fbcaD=9Zc$ve`%zq`k6?~zY$U>2Ym10 z%xVfPZmw2j9-9PIQ1$)F^+7UOlc!Sn=2qR$gj17WP$A16DI!2fV$q|w(1bijcN>-- zBC&s~uUz4=f*|IRY0Xw?ba*Wr@ z3!`q3P7M<+^N;#anQ$@ZL`?M@ccC@;#+vW0Wq}m7L=v?J6B_JIf)O7+t?M^x>_= zjQP{|*=j7AE7MGmPH29EXea9={4aX4?;oBJ|G^VrR-pJ~?NT@JU4{?Af@u~sDXm@s zL1Jjer-jTJXKjo~8EUl%t@{rBHedg3-dmShayB4b#8kjH^vrp(z3Jui{h!u@JgsXIZ6BRBdr` zoLqjvp8AQ2A{8GEttrpK=!2J{!X`vOM%=QZG3FDgdKbx>(1i)?s}HkF{bRVj{FCn} zN|SE*di)a_Mxqe#?~Z6LH3_Uw)+0s}uEL7W)4X9LL``vKTLPCk;?e=&si=;g@|x2- zOb6qKQg~Xu$3f2b=Iy{|!f3p71UxFl@V7N;*yT)Cpv_9o|H8eXl@2PzH&mc$rA&f^ zphSHz5G&4Bczt)h_x0xO!PQD12hQ63{QU;v_m;&Stki9VIUbNn;?|^1NjohWK^mFu z6;qvrh7?L`ris?(=Egn{Sr@JnY=TAcTt|BgrgMaq0AdDAU6I<{7t)+Kgt{DFrC1Cx z(>ztC@fAA*dx`=^SC^&DLs^HXi!9_t`V($hKufgYRzMZWB z^Q?>@XmXfJycXJ~$`nSe!cW($7Rel$9UZRHFMsaScAr0GC(kjmTLPK7lyQq5R&vSG zR+g~UVOR?;t?SfF_BVa_*34^2FG3B?yYqZCSjXDyu1~$Z?;zggja+~>JN}R3?*NWf z0qz#SMZooms6y`~a#$Sr%dr|6?0-2%aJ*95Uk<0;Ahf|Zv}3u7e}-~OD1kIKU>A>A_;|6K<^1kYY z=km4ZdBrR6J4xA@uwqbxGECXGrje8p%_ffMOHyE8$@>Qz8|fx5pXJ1VwAH&aRyfYC`)e-@8WM==eo zW1SPZfKJae|4U~@%hFPbze$#H%6<$MjzbnN#AbY>@b>EJ&>4Vn@riXw-2qpE5i>f~ z2psARO)-%cC$p&$84L|hLk4xHP(f#B{PZiV#kd@bK6VuuEV_F(l{yX=vbs@o#do%{ zY>JvX%`kxJ_6r@KYn5ok^9w6ywogN%5dCtlz3#WsxK)_#4~H)JehbNQEUnTg%9vbI zigOXMxl5FKS$mJn2>r-k@%yGC4k-+V+`&$y9Lzrof@sr}4q9>o@L{3uXI6)7tn&2w z9ouE-YK(~oT56Vwd^`)adxX*2EwJhpRky$-xN3Jv%Sg&tpJ{1o$!(x3*uW5uJVXen z#FzE?PQ{Ip<1wTdSg3;CqTsIT%9@tML=#KQOsA!GGY4un+8MuVkO>o_y_isXHD9nZ zv%lS0zuzrzp22Sl%wBzn@wU1-@55!hItHNdvxq7~=?YNKBbEI`ueW@bv(W|&&7}pu zfVBwMpFb5QYb+NDa*>Z>2Ttklla{&xKpX!98uGuIbrGOh30d_F9Cw#I50WBQ9HT0d zKoqs8kRXLs^2a?Bsv5R+u58aq@cZ!z`NH+|aYp;|G@+L6`!9TMrrk}S-!`D7X&h}D z^^>KJz_;38$V{s!BIL1^_20C43ZR>_>)}&xH25#;ZPz&^kH2GTdU8n$hfGA4=M{q^ zN-;u4EAg-}#msm?T3x!u0Cyz7@NmJmOMG-waGho6KF*HJYv4TXFGkWsess&d2f3Y! 
zfqQteci{R87iQnC*Bm&$*&fYP#YL~GnfJYC1#|>0FX>~yacbVtz6)*)f4?f+eYL(a zJ26c`rsQzXU)~%4gq2n=Oyu;;TNnDJZ+WyDHHuj)N?fxNDkC0jvW43$z$``GD!(oo zzA?~15%s*5jO;f=A=`TETZ>*l%f3p?H;N-0RB$Jc6MiIo*+O6Iq7vyR$;g3&U85~p zm@BW)b&E6^WCUr+c8c{&CNcG8uyXS5=qoM6I8aib0da}o$^r5AY3bZ>`oup#9~gUm zY2-XTSQ|YL6!cefn!3BYxW&60P=)#?k^I4W!B(HCg@XeQj01?P4)mougv6%5&S@@x z&m!z-zxv58+n>S*$Z8UQ?23GOIjE$Qj%XQNUnRisUbA4eIFT3Fu1ixn^b$<$J)C8k z8EAKerId?n+FJ;41}eG-NUKTB~&4`oy|!xguOmR-B^8l=G8> z-@x|JObY*tFsT2>AJkv|{yVW(4X9aroJs|tFAB@h9z?_`3zDJ-i&tZbESvK7p*41{ z>lp45zl*8xm-o&wj?`ceBU!}ZW883>i87mw^6PE|fwwU+YRn*8bWWE$apkedD9KsG zP2HBef4iV$&QI+Ix^lN!kHYfWLs5qT|8o^g1b#N8u3wh+9m(Owg1Wslv5)^*NR!j< z67e&b(McaS^P+6=`AhYqZJD#!_sad)i#=#>-5l%|m64mycCBZr72mzFTW> zOL&7P)`2+2d?DTYfri8}VOB29$d|Ck17CWM`KOf=Ea;Vj>^<((91=QUhNYML@e;#Ja!Vj1aUM5EGQZ$faZSm;TF z0_&Ke-HHzBZ03rjoTF7wJ`TP6!1;KgSGd?h7nBK08Qr9gY|}??RXi?NvYa^RR-%Qh z!kr{07%>nuZaBFzaZoBds@BBKqggBR>nEc$0cRk?l4Sl`eyMCw0QvK!K&XU-O-Q?H zxYSqHz|g3Y;U`td&zBM02WXs||Grq%1JEwMzs=rXJ7sfgSv6{W%FMMn0?e)5`+Es)wW`rhQ ztdgx#!7Fya;Z_tFV#y6NV~ANY@?5@N3{fgwRy7&T{RaKxpcnT)`!^)_zxy|L$E5JK zLcv*Hi`S;xt_Gpo6NssRdg~I+=#Gn(bMoeqy=t%1Js11i;ZnbxLcVqEIw7zw7hW7^ zyC45uz;QmKp*ar?3|ukGOK)IuvGPsJH#~o$(b|1uOY!6a{i5OM()gIJFMm^_;Aj*S z1?-n=^W~tEv;0lcyi!mwq~2I}b^8`qZWkV0aTC@T&cSK*Eo%mwDfvPWR>F&G4vt%( ziGSZC-qcQGKXpjm(9?%_12d~4BesYQ0Z|h3H1>E9E^2pC2{&|CH!=XV^u$Wk4}`>l zUIx=(u~$-6wS2Vp{WCU&_SpBLL=yema zJe0=zxHy6DLV+ZSXB&x;lFoI^I%y-ol4{p9q!fV~j*X?wN|TYd@YV(qifs5!|I4z* zphK(!uvANjv@WYAx&9FfSSJ|v2tV=zqa!p@#MP%4NPFQxGE`D}DOhZsm;xX5GuWuo z%$(t@CE&Z(jKAsBg36^RV20axWEq5SmGGuiB#B#cXO|XFQincMawd3=SW{Liv)w9# z6~f5nE9cc&Inh`+1oG;tPqFj`nzF?4WWLFn5*^e;(Lk|O=M0J=vN-se=|*UEO)WxS z{9%1HMT>N>1qJ~9(7!$nVa1M5QyLfNsArOX15A9+FZEqlb#aqG%Z!> zMOTPu9+q%+KP#%(C5Mok7Sw^V6+QYLG%pP!(gJ;6TseWhCv!aE{ zgNo*RPbJ<9rr8UW{cVI2!1(^-9^ruzyuRwS3?z!>iYVK56HP~$gu^)wskdx{_A|A%$pA`GX(VptL ziaJef3YxR1@WB%yMS;zER>t>Ld1X&uNHq?y&NxrJz@8%%-Kj4_8>(KkETl=37C?6n zL9{hy0+Z6dl!(_)ov@1viBc^_RK2*ODry+cW^DpeYDX(B-v9?P(RbJd!`>*~gZHL+ zmU_s!fq0!fBq@O`DwMo>bu z`iqauAx>6iS61dfGmQpPiEmW?adIb9c5yQ+uwJNvu)32Ph9xCfP-U`^PAZr2UxGw zPw7Mx!Bsk~WCRW91~eIdf(oT#mf98ar2i0 zpcifan!6*DM+=Eip!3c%*ZuB_qpw|l5gWSD?4BQ9dpP2t>)ltuG{6SGqS}i|Xm+j7 zDT&KU-RJIvw0~{y2wg)F14^MmzT0*3CA0d#^y74bAL>(f&ejk5uGdHG<`8x*9C*Dm zEeawvugF04+$D}Klp0zSLHErYIKsTsUQ#AZieiGLH-s7^P)E{-9mr>vqATNLxux8A z0+TdFqP~+c7EDMR@(`HDp@-bUTZg1qj3vxG2UDPDn^wS4W0N8eHBFLbR(Oleeheql z2Qd8Azh*4uRFFQvV{sx`vzk#wJ`V6#(3fJUnDM{`>Ryk*(7x z++3JEcq_fZJ8(u~(hnsvUE#P)M5h9Fh*M;bj3JUmt-O{P;|XmaacDQt`6uZ~Qc4Y| zU0E7U+oTjzVz9iSLswEpp%5SwM@afT=kVn&3?FZ#jOQM%tjd5F-HWoJ;sO!D8Wmb` zi?uc;;zw7n-^x4?Gex>_n%v=vCZYXQ8OgUeNyy3>A808nNe&_^n4laJ3%FtlDKf`_ z>61r4pf0|5sFL5th-}5^QI*R^bBX+s%RI@8T&UI5NDJiVYY8a1w#B-x8>z(k!`K=h zUebL1hh`Xt<@<@RGiw)4%*RHZSz8t{7#T^82@qmxogxc_<*tVJdA1@Za{r#gc*8KS zez}s~Sppj`%pK^ceXh&RfWj-JX%{Y0w@+(DRI2msAwOPsFFNog72pY2`v*PTUzww0 zi^YgAs+XBHa6~9)FfFFoTp0|7SX2T_@}G?w)d4OVgppPYJ2XxKd&S>9@HTyiMI(|7 z0+KuLpBGHiqtz0nH<$ERt|?)b9mn0lYu&iehe$c5&Rg&}<*! 
z;RC?p5Sl(*5$X_vOq`BW5*TtD(@UfBa`t~qu=gcj4T2o%3W(ESNU2urSTt`;uWXo} zJeK&Jx5|CWo>=|-bIIM+$gW=sW&X#K3sW~RAw;8`-M0g$n$}#v{pb#pqTp(RlNE@r znBw6RDTN;>Q@lGXz*ise0{A*uQ{^~_hMOo};!o{_0M+0eLeV(1lb^kHBz~eP@)voS zyuNw%0*~7`6vn9+)_@!N{jO*xBzKC;5fR^_|q;dOW5X2ID5Hyq-Hn+Vq9 zX^;^Ofi1$c?Sl&AM5a3AR>?BV!O6C|xq7p*Vrvu>jNIIOR>6V`H_4ZOYyz!#l5VIE z-ebTTNIjN-qEvJvC{>bvJUxVZWs-7>8VO=<@`pZRvXD~(sLuf#8w}%^-!OWN=YeIs zyTmK24xCE6fVVJ^K-ijUmKK+w4DwQF+N7eLR)zXeKYkEF90CI}VGKNL&@o`C8_mx{ zDPyLWJ>$}kUF)KGuDDo0z+Rzxc)V(gdE$R_KJn{hqJ4>45t~Ax0*6sO_(3rz((#8$ z$BnUt66UH(a^|;CjqU^a1;>dlcIb4N;ySpwMy?QIv}f6l;>7c2g_F~rixOEI<0=cA z7(NL`2-tvaWM}3MJQ?uL#&Ks(V$>0misGMPk{T8n(5$yKp-=v5rtLdEYKpEMVo6*t z$qJ^6_(DAWj6BA~D^M9|@V)_2Zizj9r>e=^1l_pzeh^y zCV&*(GbNAjC3>`kRtqJRbd;=8>w-$*LW%0KGY14l0PZhY5T2oJ^L64Y)Z?L`k`1PL zH2>ZGn~!U6x97XBUku7)MG<3;h|pUW($NyYl8rPE#sR=6nfF>tIm*cI zFE0a>fjzEJP%q^w zB27lg$t^(5DjYA$fLGXZ43S&$b+ne?ir@qpEMDVHlk`w7*YxOwV_ zJqeS7V}ZUx+K&QeKE-FL1O=>dYmpxNs1v66`W&aM_}0x7Wvncx1-R$om=w5n#f7jWOs>(3!N?Mu;sY=?sa))O>aJIhLB)}7h%L!Zhpf3nHZ&xbU z{_;FBtnai*fF7Gmk441k<5J;!sKz5^WRbj=`~jl>>ClO7DmzkA+A;X&NUF3D zlS!#m-2$e~3VN*xEGYpSl{{T9)5Y`o8?HyS>-c1Trmik&GYN)8p}uVkuX%k8Yln}D$hfonX!TSeEir${dd}d)Wu8J7J{p#U>b8{Kqq3ep|$|sAV9RHD9Mi zwn6+(*K^bP;AVyo*cyIEk>L1ga60V{#x?xWy<3SmyC@SKr%^TD}%EK z645w3v41OKiSVN_vm|_~WcZ*8BrOk^*Wlw6g!T@hAHp_D{u1J#8;g&z_ z&h_;1xmpEF>@3gGPubJ&+gl}+5#RxM`&R*o|55e-a-k7(&@~-xejsSYBE@+Ybur(A-n+LRr_XGoTwlNE!PFmZFj-Gj zh7~*N=CnLP2_z2f-$F<==CI}IQk@RbhOF>b7R=_kS8D5Go`X!?13iQ#y*0ij zKOx(<5QbbyYTQa}zd0?GBvi|*pT{$MVom-;L!c*UVdk;ASs_?KXN58cBQwN2sxieR z8VeS&Y3+W}q4p#fU6|hDu2#kN!OfK&16F;IvQ?9ZLd48X&#fAG z@vR3@L*8C;q1l?FACYZ$JrnNTsl4 zGbU?mV3U)o_ybBHDrD(&!cj%Q^t9dczoC+#W(2 zC;=?CQqGJh$TS@_CCsBBeKxPj!iyR2mH(E2GLH`Myh~K_NMEase`HG^sY$;fWusMN z%q&h`%?1~ZL%P-cWO*4_dL=#VK_~0PC#~bi5!9;fH)w@zS#i)tBwK-$JSL2mx=S7^ zCjx3AP%s5oOZh$t4}oq4=$t5y#gM#O6Bv5)xf|SMfePeOy1AXibHda8^)ge9qCihE z3K-UDuv+U+zRLZ2lI97&Lu+s(3=PARePgXF93y~&9eL0MU*saKa&G*p$r`=wqaEtiV=&$)DVhpKAas4Jd&4 ze+7U9D1iSaN0OhJl520>*ObB`lPn_zOQ7E6lUvFNY&2D5M%sUd0u;WIIlrMF%T?kt zw3GNigKhlj51!rEET`HT-ga-VFRG;<#WC|_djpS|QdTNI?>xJo0jqfyodyVi#{=!{ z z>r^ggzIXTW!+s{t68`qNImF%mnN5*5@9xOk21W|osK^ig>(1Lq0D6ErHZ(7+C7jw) zP+M`n<(^QppLMNysS@%OLUd36D$q*_u+x;JNi_Upngb%FjcE*=!f_+PrpG=`&j@Rn zK~6k2`vFEWx^F5_J*QmphIU@#Ke(cP>Gd0~lkqYZ>*5*>a7W35gDv zYU^R3r=Iu3k9E7sM4I?lHqzEe`JGDu^4=I0A}dU!x7?i>YG)HR#B8y#laN`pQNxl@ zRH#Ki#h%5GS%w89a3pK3eF@SK_!nv3-SG6&@Ea}FfQuh~`zt!?y`eIQLt-Vhn@El# zAUwzNle_?5%0HkvbtvsL`Sb0d^CQ8CB^i=Y4zK}2WN`yln-M1aO&Qm{=3>phuT=Zh zb7(R*a3!W9MZ|2zrtvG=4lc8iDKWhIh*G78TYdN;At%w%e`0-ozM%;NJ#`{tka0F7 z(nw-*^(0VZ;t4R3g1)Pk-lhzH>9*1+ZiTu8|K2X7H$cy?an)!F^wRO>!-!#O)N1!w z@<9{zmNZfm79lc0RnVybya!jln>ra5BkMAu z@qksZYr{}sFq`UwcEkIPU+)pxBizvQf6*&G|6oP_3+sPV{6dOZ*Fc`^7eIeGiI`g;k4b!wbY=zJR&=&UVFIwGLSWCU1} zq**c`CZMn=a{C;^(FZe#j&d2u&?54J2?}!0sBfs+!)vuGPmqp zE+v{qoQfc1*`cl-m9ETUxf}4o#*P2tYGFYrCd6j;7gwc!xTY_G0dNHa;QF*Y{06(u zx*7_fZuLh86hF$6pvKW=k8WJ3+`uCW{PxFV3EEmcTmlAichXL;j|)AgXlnz(@t&Xa zcEGXh?n3(5kRDh?mLrpcMpj@QX;js~WTHMLs#`)~c^ zS<4P`5C4(V^~H)9#u^hWBe;_;Dux-->e@N0#<)sQ(CM)rmQq!pM+t(FZ8|EqZyNj45}MW zTok+TSv5whv4qO7925qPS;O15QaY0U;2b4-^{oPD^`nlxCrz z3K{BZ%|A`b>rDXB%n1nx4j3dQ@C|G0OVggky@D~nV{S>Nwg_qKe|+>9FN1A zkRZ?rS;f~WM{~um42V$7I%PtO*OMT2gPAyS3ZX-gm)X6@kM3;W6ZH{_KQtfBBPhz0 zF2Qh?&=vaADoa}TYnPtnkK8|}nBdV5VI_@(-+u6po;^QzeC;JOJ5O_C_&(3wxDeaj z{?!Y2^yDW8@C0oI8I#3YFtO-Z#t|AuSZIi_WhFc+copTU7H{L4Vgz+d;hcSb%Z{@G zC)y*(aji*xfbC@$dpuYa(kDyl7v!TnF(uO6+7KdoyJ5q=l=xF)g#Z&|v&xWi-o&WM+Qv z$9(TEn^B&0oE$B$!0U~kM()ikxQ;j>7VDH6#=C0{UaUqzv38>D*a13CV+H9 zV8N?`AIl4pl`H(wl9{S1i4JhPBlUo|6JR|2?~S3Xpm_%?u9X5^ol_5>)84#hdHWR# 
zfeDw5X^=!+_*$7p!5~aDOd%t2s2Yo&meg2-Z9hne(*CXU7J=Eqgn92l{gH$_I9S&`Zkh ziZv}Bl&K9*2aew#B?BuzMWBaA2SXmGas3%myh_?I`y4?v=vk@&PejAV6or#={;Mt=J+ja@tAZc)wTB8E`?1 z#?_FFzg~o4iE#)70rOzdOR8ao(9?~ZgR_ORC3@mq1&BMTDTbF1gpVHU9s}1Zg5BjX2cfD?M(9VcN9J zi0~wSlL&2ZWbC#~c;)Pr+qa)}?4~pzlAu;yvJM>jB0JjZ0+S1pwlQkcm@Es-Wr^t) z=uk4rE6UqaH6#=iRW{$lO85pgbqhAP#1f@k{3H^nNO3GPBf^HVr4iUEL)o4$`(wZK z9%b(M_+tIW-f?zlR(K$Fl6TQ_^ZEg>pf8>?+M2{sEu?>o^^@oQ+&JVe9W@)p9*0&Q zj~i6vu|bosgHlaT`2kinSHcO_j3_TP3jPvSJnmBs63`8XcZBQ@wZFE4LO|_nEf;Ri zc1^?$lj<GcTgf7TixDF5I``wRbnqbX#1 zd-rptyTb@cOO9a)EQ$(FN?}VC3d*z>Kx{2;YMmDtuVKFxmgUaroue3CGTMkh-~5|T zvM^`UBmG!H{K3gwPNx#o(*}>1G>?!IOwy@QzgPT9;ty~r&WWo$06J^4S-8Am380)q zVEJtY$^&ZfmUf_^cs=TUNJX4hX}GfiEq-(-JJ_#>pa?y(ApFE_Gi%ihubI7Fz4~x> z{@eq0c;i$g<^|$f_VcMWG_{odKbz0?T>M#6Kw#D3=vho&c>^>%<7bk*+1VSd_HWWP z>2ve%fGoc|^Gl|Y(iz3^-@xi||ECp3TtpnCdPI9^dr#TnOWYeFHPs;fOeU(hfm8;y zIp8P?mtN+Mai~t3=1$wrhMV~_JhrNn<)&!^@6FSJ&sa&^JQA7)zWmMrqEw`+slIuNDZ`k7TNa0u0MR7g+m8=E*Zv8Gy+n&VbG4JF06In3`5GrE z=!z54eyC=|Vby{23l~lr|Rca!rzVS##LaC!Ot)q|CFF{rVfLC0h8=1 z0}pg;>w}4NLW8VGIl=LXdROBTChp`l-vK<$>;W%4JWnrx|D99e^yuB&4c?8#woIY6 zhhP``4NWKKHl+ZnNz3EQ^SXlvE`&UFMxga6|KRH7S!+wRHfC~6xlxmrUy2bNHh@?D zm8krnqK8Q6QG#gBNLj5+m&`HL74D%|DagLePc zu(Nl>?SlLeaCdyBCHgay-7|>Yq!i53lc5{KU-+L>7lV55-i9(Igy<_L)a_JePO_v)m%U&FAA({_2-QFm^mw! zLG!kqAQjD-E#S;*qKP8^8UrllKi39z)PsK_<9q4$0)6=j^ytZyI{H98d0YeRH#6^Np8sL(eXet_<9jSYpyT!03L!Q?+#qsFbg=-Nb(;xRXG#(H zRO=5YH))spkWu%&$=aQihL(pD&!SBTt&OcA+m+^wM2-dMVrBDJ^x9Eo+L$f|8~z$g zKN?HAFsfq<9p*P9qnjeh@DBgSVn{u{i4(%p7AqHtr7PaI-YqBtf{5X<+Ur<7NMJppR2 z?5bEN0q4#Cp>(&>bNqQ%+ZU)2M(qbIzHDS%Wh`ICu=NlHXr(P_k}@`}AB=P0GA0|R z)i4+9JxX!s-X*2gpEN)(T&-3vTjB^Y1!BPR;0h5A&4-2M76AjoFZWH-ZGiAgW~NAG z{U-b6QMiGf87^`AqB(>>#as7p4m_}7{b{%u8C#_qR=J1w`dlmdcN<7FOP|zDB zNsu1-c7}BJDvjAPz#Hl`il|jSoWuhEO4qaqDn=Kv$!&~6VjHUOg?CvhLj4#RiDtc& z>AKE=mTMlkgc4QX4-hT#k=C-bYemL z#ujI0?hR^xlE*>~>&Bsn4F-SB-SkZ=EvS|e`BgJPdy(zXO8;b(aK>8@_ArU<4(vYr zW0AcYh(S@e8gUX+mGN%3RPJ`98tH~b)M5uT_pCs!t{`b9z)a3YyZA%h7!apDmZxYS zVkKsH zRc%d_1MC38l@dW{j-vx)SDdH6LLa&W&2>Uxi z*B_3tJ9Be|Xp=?xe6c9{;>X%@?E$VA3P-Qha2klkHhGVLF`&Xutrx_JlIJA z@&3yB6M9V_9rbDd%awaXzyy7kgaxrWmwXD)McaY0CV4P`dk_ERMw-fue*Wb~ycuFZ zh_sj-Ad}h8#1Cp5uSKQ$Q_->z(N4~=3PT60kTe3ygl@R!aX~_~)NtJTNL`GV0&tE) zxI1@WfZkmkxQiXt;#rZac|ZXS6*cqIK^=7?fN(W?QL{BRSbT~@9%TGQK*hc%G~P}# zsM&S2ez#TnW2UD&lI7{q5ZoWN+IE z$>lZWBdd}tONXhK@Hhz)`_3d&X$2hP5nEGGWRX&h9H8e}5xJG}U^bxIW^xk2J41_{ z{)m@qR}4+sB^Y&yLBX=&wLb!LBzcHqIU*I6|B6=TuG#`C{Z=}$3`=ckyZhY1nqWW9 z^kheE?!3vD-X;a1pe8!q#Oe1yeeUu~DempdM)LgQk`F?Anu%%&LR+*1sV=gwS0?!nF6-s2Ck?uOJG%TAbjdCw z02T^(^54S7`Yi03znLyvu@{IMc+Y(M3`lb!^O9=%GAtCKxFvowhqKj%c>oYaK!ab$ zmwqq)62u`r3no$stc-8*lI>_a)z&>a#b#>=qJ$Zo3{7BN!r&+)icw;YMGhN}!MW?I z2Vz|O*vQtaKm9Z6{9J|`Q!64=JjpY%gXyX&}nd<;Zz zl|<0%u$pHs4R>{z=ems#1@{8;oUP$kJ&v*$yELtF;43`XurG(XN&7Ns%84{F$#9Xx z`Sspk(=l2u#jPVH$u&%UU{#qxQz)76{n~a>K7S8aoEQOc;BeAfcj~YoLp4U*7tzjp z;+sXTr+}HznNFsBuP*I?y}DL;ek%Gbj5@eGK-`sv+6LlG`gHahf@2o`{x^SS@3?`PNZJGnUs1pkV<;1;$5r)uv-2 znOI)tFnO_p%^_5sAusFRLGj6jn7oJ%A`5?KmorON?8`u}7yR9JcB-2wowL3_E#a`pn+RCkAXfc~h-S%1{7XBToPsAeHQ#k97gbd*Ws(SC=s>TLXb@JbY z*RU;w?Dn0(3fO=-2C%R>*?fN0j$)wr974L*Au|JUvoY80F2TLX-+}mH1$3mCu~Jv* z?#)Ziu4A<9^X|k8KM*SA_Xz5Jk7>!sINqU~r!_<*c6nxf_*~pP1222#zUE-m2u__nxUTnW+kUk7 zTy%uIXFt&N{&=YBY+BU&4(a5EnH^Kj4>b`fC)uNhM+ZYLGrO?MAeJv>lJ(~c8#yIU zkH{}R2qyGDyr6fze|SJy{$=lMCW77eq;JH zZBz+dWTYEc#N`Lna>jqnwF zR1m;CT%aTWtLhWVJRFq1M5BHn8xl)P+z6)nKGeCZ6$({;@|47pmj}eWlekoxwSxXu z_P6#-3I?($30RxLYrx2Fl%MPaL|2A(l5ALYDu1x>c*kn0E&3&I8)~loRdN|p9E?hn z@%zOB1kE^=9jTHC=Pt1V>jUO5vxO~GCCEmRV=O98wN*5mC~ce~;g)T&=>v%$^I@qm 
[The remainder of PATCH 02/86 and patches 03/86 through 16/86 survive only as corrupted binary data in this copy and could not be recovered; the readable series resumes with PATCH 17/86 below.]
Date: Tue, 2 Feb 2021 10:39:47 +0100 Subject: [PATCH 17/86] added method to list the known vocabulary names --- .../eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java index fac55189b..f81181e53 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java @@ -67,6 +67,10 @@ public class VocabularyGroup implements Serializable { private final Map vocs = new HashMap<>(); + public Set vocabularyNames() { + return vocs.keySet(); + } + public void addVocabulary(final String id, final String name) { vocs.put(id.toLowerCase(), new Vocabulary(id, name)); } From d62ea1490d494393a730cf10ec61d592ca21e4a5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 2 Feb 2021 10:53:19 +0100 Subject: [PATCH 18/86] cleaned up RabbitMQ stuff --- dhp-common/pom.xml | 5 - .../main/java/eu/dnetlib/message/Message.java | 76 ---------- .../eu/dnetlib/message/MessageConsumer.java | 47 ------ .../eu/dnetlib/message/MessageManager.java | 136 ------------------ .../java/eu/dnetlib/message/MessageType.java | 6 - .../java/eu/dnetlib/message/MessageTest.java | 51 ------- pom.xml | 5 - 7 files changed, 326 deletions(-) delete mode 100644 dhp-common/src/main/java/eu/dnetlib/message/Message.java delete mode 100644 dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java delete mode 100644 dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java delete mode 100644 dhp-common/src/main/java/eu/dnetlib/message/MessageType.java delete mode 100644 dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 6eb2e0358..a8607a9b3 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -53,11 +53,6 @@ com.fasterxml.jackson.core jackson-databind
- - - com.rabbitmq - amqp-client - net.sf.saxon Saxon-HE diff --git a/dhp-common/src/main/java/eu/dnetlib/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/message/Message.java deleted file mode 100644 index fc1c38291..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/message/Message.java +++ /dev/null @@ -1,76 +0,0 @@ - -package eu.dnetlib.message; - -import java.io.IOException; -import java.util.Map; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; - -public class Message { - - private String workflowId; - - private String jobName; - - private MessageType type; - - private Map body; - - public static Message fromJson(final String json) throws IOException { - final ObjectMapper jsonMapper = new ObjectMapper(); - return jsonMapper.readValue(json, Message.class); - } - - public Message() { - } - - public Message(String workflowId, String jobName, MessageType type, Map body) { - this.workflowId = workflowId; - this.jobName = jobName; - this.type = type; - this.body = body; - } - - public String getWorkflowId() { - return workflowId; - } - - public void setWorkflowId(String workflowId) { - this.workflowId = workflowId; - } - - public String getJobName() { - return jobName; - } - - public void setJobName(String jobName) { - this.jobName = jobName; - } - - public MessageType getType() { - return type; - } - - public void setType(MessageType type) { - this.type = type; - } - - public Map getBody() { - return body; - } - - public void setBody(Map body) { - this.body = body; - } - - @Override - public String toString() { - final ObjectMapper jsonMapper = new ObjectMapper(); - try { - return jsonMapper.writeValueAsString(this); - } catch (JsonProcessingException e) { - return null; - } - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java deleted file mode 100644 index fb3f0bd95..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java +++ /dev/null @@ -1,47 +0,0 @@ - -package eu.dnetlib.message; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.concurrent.LinkedBlockingQueue; - -import com.rabbitmq.client.AMQP; -import com.rabbitmq.client.Channel; -import com.rabbitmq.client.DefaultConsumer; -import com.rabbitmq.client.Envelope; - -public class MessageConsumer extends DefaultConsumer { - - final LinkedBlockingQueue queueMessages; - - /** - * Constructs a new instance and records its association to the passed-in channel. 
- * - * @param channel the channel to which this consumer is attached - * @param queueMessages - */ - public MessageConsumer(Channel channel, LinkedBlockingQueue queueMessages) { - super(channel); - this.queueMessages = queueMessages; - } - - @Override - public void handleDelivery( - String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body) - throws IOException { - final String json = new String(body, StandardCharsets.UTF_8); - Message message = Message.fromJson(json); - try { - this.queueMessages.put(message); - System.out.println("Receiving Message " + message); - } catch (InterruptedException e) { - if (message.getType() == MessageType.REPORT) - throw new RuntimeException("Error on sending message"); - else { - // TODO LOGGING EXCEPTION - } - } finally { - getChannel().basicAck(envelope.getDeliveryTag(), false); - } - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java deleted file mode 100644 index 5ca79f3cc..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java +++ /dev/null @@ -1,136 +0,0 @@ - -package eu.dnetlib.message; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.TimeoutException; - -import com.rabbitmq.client.Channel; -import com.rabbitmq.client.Connection; -import com.rabbitmq.client.ConnectionFactory; - -public class MessageManager { - - private final String messageHost; - - private final String username; - - private final String password; - - private Connection connection; - - private final Map channels = new HashMap<>(); - - private boolean durable; - - private boolean autodelete; - - private final LinkedBlockingQueue queueMessages; - - public MessageManager( - String messageHost, - String username, - String password, - final LinkedBlockingQueue queueMessages) { - this.queueMessages = queueMessages; - this.messageHost = messageHost; - this.username = username; - this.password = password; - } - - public MessageManager( - String messageHost, - String username, - String password, - boolean durable, - boolean autodelete, - final LinkedBlockingQueue queueMessages) { - this.queueMessages = queueMessages; - this.messageHost = messageHost; - this.username = username; - this.password = password; - - this.durable = durable; - this.autodelete = autodelete; - } - - private Connection createConnection() throws IOException, TimeoutException { - ConnectionFactory factory = new ConnectionFactory(); - factory.setHost(this.messageHost); - factory.setUsername(this.username); - factory.setPassword(this.password); - return factory.newConnection(); - } - - private Channel createChannel( - final Connection connection, - final String queueName, - final boolean durable, - final boolean autodelete) - throws Exception { - Map args = new HashMap<>(); - args.put("x-message-ttl", 10000); - Channel channel = connection.createChannel(); - channel.queueDeclare(queueName, durable, false, this.autodelete, args); - return channel; - } - - private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete) - throws Exception { - if (channels.containsKey(queueName)) { - return channels.get(queueName); - } - - if (this.connection == null) { - this.connection = createConnection(); - } - channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete)); - return channels.get(queueName); - } - - public void 
close() throws IOException { - channels - .values() - .forEach( - ch -> { - try { - ch.close(); - } catch (Exception e) { - // TODO LOG - } - }); - - this.connection.close(); - } - - public boolean sendMessage(final Message message, String queueName) throws Exception { - try { - Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete); - channel.basicPublish("", queueName, null, message.toString().getBytes()); - return true; - } catch (Throwable e) { - throw new RuntimeException(e); - } - } - - public boolean sendMessage( - final Message message, String queueName, boolean durable_var, boolean autodelete_var) - throws Exception { - try { - Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var); - channel.basicPublish("", queueName, null, message.toString().getBytes()); - return true; - } catch (Throwable e) { - throw new RuntimeException(e); - } - } - - public void startConsumingMessage( - final String queueName, final boolean durable, final boolean autodelete) throws Exception { - - Channel channel = createChannel(createConnection(), queueName, durable, autodelete); - channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages)); - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java deleted file mode 100644 index 72cbda252..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java +++ /dev/null @@ -1,6 +0,0 @@ - -package eu.dnetlib.message; - -public enum MessageType { - ONGOING, REPORT -} diff --git a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java deleted file mode 100644 index 442f7b5c2..000000000 --- a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java +++ /dev/null @@ -1,51 +0,0 @@ - -package eu.dnetlib.message; - -import static org.junit.jupiter.api.Assertions.*; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -import org.junit.jupiter.api.Test; - -public class MessageTest { - - @Test - public void fromJsonTest() throws IOException { - Message m = new Message(); - m.setWorkflowId("wId"); - m.setType(MessageType.ONGOING); - m.setJobName("Collection"); - Map body = new HashMap<>(); - body.put("parsedItem", "300"); - body.put("ExecutionTime", "30s"); - - m.setBody(body); - System.out.println("m = " + m); - Message m1 = Message.fromJson(m.toString()); - assertEquals(m1.getWorkflowId(), m.getWorkflowId()); - assertEquals(m1.getType(), m.getType()); - assertEquals(m1.getJobName(), m.getJobName()); - - assertNotNull(m1.getBody()); - m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it))); - assertEquals(m1.getJobName(), m.getJobName()); - } - - @Test - public void toStringTest() { - final String expectedJson = "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}"; - Message m = new Message(); - m.setWorkflowId("wId"); - m.setType(MessageType.ONGOING); - m.setJobName("Collection"); - Map body = new HashMap<>(); - body.put("parsedItem", "300"); - body.put("ExecutionTime", "30s"); - - m.setBody(body); - - assertEquals(expectedJson, m.toString()); - } -} diff --git a/pom.xml b/pom.xml index 3e0626aed..cfe1edfbd 100644 --- a/pom.xml +++ b/pom.xml @@ -374,11 +374,6 @@ provided - - com.rabbitmq - amqp-client - 5.6.0 - com.jayway.jsonpath json-path From 
0634674add8c18d393e65ef68d200ba2be3bd6da Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 2 Feb 2021 12:12:14 +0100 Subject: [PATCH 19/86] implemented transformation test --- .../GenerateDataciteDatasetSpark.scala | 2 +- .../transformation/TransformSparkJobNode.java | 15 +- .../transformation/TransformationFactory.java | 4 +- .../oozie_app/config-default.xml | 5 +- .../dhp/transformation/oozie_app/workflow.xml | 53 ++++- .../dhp/aggregation/AggregationJobTest.java | 197 ++++++++++++++++++ .../GenerateNativeStoreSparkJobTest.java | 169 --------------- .../transformation/TransformationJobTest.java | 4 + .../dhp/collection/mdStoreCleanedVersion.json | 9 + 9 files changed, 275 insertions(+), 183 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java delete mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala index 6837e94b2..f04f92c63 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala @@ -27,7 +27,7 @@ object GenerateDataciteDatasetSpark { val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) - + log.info(s"vocabulary size is ${vocabularies.getTerms("dnet:languages").size()}") val spark: SparkSession = SparkSession.builder().config(conf) .appName(GenerateDataciteDatasetSpark.getClass.getSimpleName) .master(master) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index b9df902a1..193da3878 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -24,6 +24,7 @@ import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.aggregation.common.AggregationCounter; import eu.dnetlib.dhp.aggregation.common.AggregationUtility; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -60,15 +61,23 @@ public class TransformSparkJobNode { final String isLookupUrl = parser.get("isLookupUrl"); log.info(String.format("isLookupUrl: %s", isLookupUrl)); + final String dateOfTransformation = parser.get("dateOfTransformation"); + log.info(String.format("dateOfTransformation: %s", dateOfTransformation)); + + final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl); + final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService); + + log.info("Retrieved {} vocabularies", 
vocabularies.vocabularyNames().size()); + SparkConf conf = new SparkConf(); runWithSparkSession( conf, isSparkSessionManaged, spark -> transformRecords( - parser.getObjectMap(), isLookupService, spark, nativeMdStoreVersion.getHdfsPath(), - cleanedMdStoreVersion.getHdfsPath())); + parser.getObjectMap(), isLookupService, spark, nativeMdStoreVersion.getHdfsPath() + "/store", + cleanedMdStoreVersion.getHdfsPath() + "/store")); } public static void transformRecords(final Map args, final ISLookUpService isLookUpService, @@ -82,7 +91,7 @@ public class TransformSparkJobNode { final Encoder encoder = Encoders.bean(MetadataRecord.class); final Dataset mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder); final MapFunction XSLTTransformationFunction = TransformationFactory - .getTransformationPlugin(args, ct, isLookUpService); + .getTransformationPlugin(args, ct, isLookUpService); mdstoreInput.map(XSLTTransformationFunction, encoder).write().save(outputPath + "/store"); log.info("Transformed item " + ct.getProcessedItems().count()); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java index d1f896964..45ba2981f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java @@ -18,7 +18,7 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; public class TransformationFactory { private static final Logger log = LoggerFactory.getLogger(TransformationFactory.class); - public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') where $x//RESOURCE_IDENTIFIER/@value = \"%s\" return $x//CODE/text()"; + public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') where $x//RESOURCE_IDENTIFIER/@value = \"%s\" return $x//CODE/*[local-name() =\"stylesheet\"]"; public static MapFunction getTransformationPlugin( final Map jobArgument, final AggregationCounter counters, final ISLookUpService isLookupService) @@ -57,7 +57,7 @@ public class TransformationFactory { private static String queryTransformationRuleFromIS(final String transformationRuleId, final ISLookUpService isLookUpService) throws Exception { final String query = String.format(TRULE_XQUERY, transformationRuleId); - log.info("asking query to IS: " + query); + System.out.println("asking query to IS: " + query); List result = isLookUpService.quickSearchProfile(query); if (result == null || result.isEmpty()) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml index e77dd09c9..bdd48b0ab 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml @@ -15,8 +15,5 @@ oozie.action.sharelib.for.spark spark2 - - oozie.launcher.mapreduce.user.classpath.first - true - + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml 
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml index aff87dc79..43b270eaf 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml @@ -18,12 +18,17 @@ transformationPlugin + XSLT_TRANSFORM The transformation Plugin dateOfTransformation The timestamp of the transformation date + + isLookupUrl + The IS lookUp service endpoint + @@ -35,22 +40,36 @@ + + + oozie.launcher.mapreduce.user.classpath.first + true + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode --actionREAD_LOCK --mdStoreID${mdStoreInputId} --mdStoreManagerURI${mdStoreManagerURI} + + + + oozie.launcher.mapreduce.user.classpath.first + true + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode --actionNEW_VERSION --mdStoreID${mdStoreOutputId} --mdStoreManagerURI${mdStoreManagerURI} + @@ -62,7 +81,7 @@ cluster Transform MetadataStore eu.dnetlib.dhp.transformation.TransformSparkJobNode - dhp-aggregations-${projectVersion}.jar + dhp-aggregation-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -72,11 +91,12 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --mdstoreInputVersion${wf:actionData('StartTransaction')['mdStoreVersion']} - --mdstoreOutputVersion${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + --mdstoreOutputVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdstoreInputVersion${wf:actionData('BeginRead')['mdStoreReadLockVersion']} --dateOfTransformation${dateOfTransformation} --transformationPlugin${transformationPlugin} --transformationRuleId${transformationRuleId} + --isLookupUrl${isLookupUrl} @@ -84,6 +104,13 @@ + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode --actionREAD_UNLOCK --mdStoreManagerURI${mdStoreManagerURI} @@ -96,6 +123,12 @@ + + + oozie.launcher.mapreduce.user.classpath.first + true + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode --actionCOMMIT --namenode${nameNode} @@ -108,18 +141,30 @@ + + + oozie.launcher.mapreduce.user.classpath.first + true + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode --actionREAD_UNLOCK --mdStoreManagerURI${mdStoreManagerURI} --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} - + + + + oozie.launcher.mapreduce.user.classpath.first + true + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode --actionROLLBACK --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java new file mode 100644 index 000000000..c9ccbc7ff --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java @@ -0,0 +1,197 @@ + +package eu.dnetlib.dhp.aggregation; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob; +import
eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.transformation.TransformSparkJobNode; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; + +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +public class AggregationJobTest { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + private static Encoder encoder; + + private static final String encoding = "XML"; + private static final String dateOfCollection = System.currentTimeMillis() + ""; + private static final String xpath = "//*[local-name()='header']/*[local-name()='identifier']"; + private static String provenance; + + private static final Logger log = LoggerFactory.getLogger(AggregationJobTest.class); + + @BeforeAll + public static void beforeAll() throws IOException { + provenance = IOUtils.toString(AggregationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/collection/provenance.json")); + workingDir = Files.createTempDirectory(AggregationJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + + conf.setAppName(AggregationJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + encoder = Encoders.bean(MetadataRecord.class); + spark = SparkSession + .builder() + .appName(AggregationJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + @Order(1) + public void testGenerateNativeStoreSparkJobRefresh() throws Exception { + + MDStoreVersion mdStoreV1 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_1.json"); + FileUtils.forceMkdir(new File(mdStoreV1.getHdfsPath())); + + IOUtils + .copy( + getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/sequence_file"), + new FileOutputStream(mdStoreV1.getHdfsPath() + "/sequence_file")); + + GenerateNativeStoreSparkJob + .main( + new String[]{ + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-encoding", encoding, + "-dateOfCollection", dateOfCollection, + "-provenance", provenance, + "-xpath", xpath, + "-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1), + "-readMdStoreVersion", "", + "-workflowId", "abc" + }); + + verify(mdStoreV1); + } + + @Test + @Order(2) + public void testGenerateNativeStoreSparkJobIncremental() throws Exception { + + MDStoreVersion 
mdStoreV2 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_2.json"); + FileUtils.forceMkdir(new File(mdStoreV2.getHdfsPath())); + + IOUtils + .copy( + getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/sequence_file"), + new FileOutputStream(mdStoreV2.getHdfsPath() + "/sequence_file")); + + MDStoreVersion mdStoreV1 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_1.json"); + + GenerateNativeStoreSparkJob + .main( + new String[]{ + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-encoding", encoding, + "-dateOfCollection", dateOfCollection, + "-provenance", provenance, + "-xpath", xpath, + "-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV2), + "-readMdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1), + "-workflowId", "abc" + }); + + verify(mdStoreV2); + } + + + //@Test + @Order(3) + public void testTransformSparkJob() throws Exception { + + MDStoreVersion mdStoreV2 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_2.json"); + MDStoreVersion mdStoreCleanedVersion = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json"); + + TransformSparkJobNode.main(new String[]{ + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-dateOfTransformation", dateOfCollection, + "-mdstoreInputVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV2), + "-mdstoreOutputVersion", OBJECT_MAPPER.writeValueAsString(mdStoreCleanedVersion), + "-transformationPlugin", "XSLT_TRANSFORM", + "-isLookupUrl", "https://dev-openaire.d4science.org/is/services/isLookUp", + "-transformationRuleId", "183dde52-a69b-4db9-a07e-1ef2be105294_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="}); + + } + + protected void verify(MDStoreVersion mdStoreVersion) throws IOException { + Assertions.assertTrue(new File(mdStoreVersion.getHdfsPath()).exists()); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + long seqFileSize = sc + .sequenceFile(mdStoreVersion.getHdfsPath() + "/sequence_file", IntWritable.class, Text.class) + .count(); + + final Dataset mdstore = spark.read().load(mdStoreVersion.getHdfsPath() + "/store").as(encoder); + long mdStoreSize = mdstore.count(); + + long declaredSize = Long.parseLong(IOUtils.toString(new FileReader(mdStoreVersion.getHdfsPath() + "/size"))); + + Assertions.assertEquals(seqFileSize, declaredSize, "the size must be equal"); + Assertions.assertEquals(seqFileSize, mdStoreSize, "the size must be equal"); + + long uniqueIds = mdstore + .map((MapFunction) MetadataRecord::getId, Encoders.STRING()) + .distinct() + .count(); + + Assertions.assertEquals(seqFileSize, uniqueIds, "the size must be equal"); + } + + private MDStoreVersion prepareVersion(String filename) throws IOException { + MDStoreVersion mdstore = OBJECT_MAPPER + .readValue(IOUtils.toString(getClass().getResource(filename)), MDStoreVersion.class); + mdstore.setHdfsPath(String.format(mdstore.getHdfsPath(), workingDir.toString())); + return mdstore; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java deleted file mode 100644 index 715ad8fa6..000000000 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java +++ /dev/null @@ -1,169 +0,0 @@ - -package eu.dnetlib.dhp.collection; - -import java.io.File; -import java.io.FileOutputStream; -import 
java.io.FileReader; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Text; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; -import org.junit.jupiter.api.*; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; - -@TestMethodOrder(MethodOrderer.OrderAnnotation.class) -public class GenerateNativeStoreSparkJobTest { - - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - private static SparkSession spark; - - private static Path workingDir; - - private static Encoder encoder; - - private static final String encoding = "XML"; - private static final String dateOfCollection = System.currentTimeMillis() + ""; - private static final String xpath = "//*[local-name()='header']/*[local-name()='identifier']"; - private static String provenance; - - private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJobTest.class); - - @BeforeAll - public static void beforeAll() throws IOException { - provenance = IOUtils.toString(GenerateNativeStoreSparkJobTest.class.getResourceAsStream("provenance.json")); - workingDir = Files.createTempDirectory(GenerateNativeStoreSparkJobTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); - - SparkConf conf = new SparkConf(); - - conf.setAppName(GenerateNativeStoreSparkJobTest.class.getSimpleName()); - - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - - encoder = Encoders.bean(MetadataRecord.class); - spark = SparkSession - .builder() - .appName(GenerateNativeStoreSparkJobTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } - - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } - - @Test - @Order(1) - public void testGenerateNativeStoreSparkJobRefresh() throws Exception { - - MDStoreVersion mdStoreV1 = prepareVersion("mdStoreVersion_1.json"); - FileUtils.forceMkdir(new File(mdStoreV1.getHdfsPath())); - - IOUtils - .copy( - getClass().getResourceAsStream("sequence_file"), - new FileOutputStream(mdStoreV1.getHdfsPath() + "/sequence_file")); - - GenerateNativeStoreSparkJob - .main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-encoding", encoding, - "-dateOfCollection", dateOfCollection, - "-provenance", provenance, - "-xpath", xpath, - "-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1), - "-readMdStoreVersion", "", - "-workflowId", "abc" - }); - - verify(mdStoreV1); - } - - @Test - @Order(2) - public void testGenerateNativeStoreSparkJobIncremental() throws Exception { - - MDStoreVersion mdStoreV2 = prepareVersion("mdStoreVersion_2.json"); - FileUtils.forceMkdir(new File(mdStoreV2.getHdfsPath())); - - 
IOUtils - .copy( - getClass().getResourceAsStream("sequence_file"), - new FileOutputStream(mdStoreV2.getHdfsPath() + "/sequence_file")); - - MDStoreVersion mdStoreV1 = prepareVersion("mdStoreVersion_1.json"); - - GenerateNativeStoreSparkJob - .main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-encoding", encoding, - "-dateOfCollection", dateOfCollection, - "-provenance", provenance, - "-xpath", xpath, - "-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV2), - "-readMdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1), - "-workflowId", "abc" - }); - - verify(mdStoreV2); - } - - protected void verify(MDStoreVersion mdStoreVersion) throws IOException { - Assertions.assertTrue(new File(mdStoreVersion.getHdfsPath()).exists()); - - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - long seqFileSize = sc - .sequenceFile(mdStoreVersion.getHdfsPath() + "/sequence_file", IntWritable.class, Text.class) - .count(); - - final Dataset mdstore = spark.read().load(mdStoreVersion.getHdfsPath() + "/store").as(encoder); - long mdStoreSize = mdstore.count(); - - long declaredSize = Long.parseLong(IOUtils.toString(new FileReader(mdStoreVersion.getHdfsPath() + "/size"))); - - Assertions.assertEquals(seqFileSize, declaredSize, "the size must be equal"); - Assertions.assertEquals(seqFileSize, mdStoreSize, "the size must be equal"); - - long uniqueIds = mdstore - .map((MapFunction) MetadataRecord::getId, Encoders.STRING()) - .distinct() - .count(); - - Assertions.assertEquals(seqFileSize, uniqueIds, "the size must be equal"); - } - - private MDStoreVersion prepareVersion(String filename) throws IOException { - MDStoreVersion mdstore = OBJECT_MAPPER - .readValue(IOUtils.toString(getClass().getResource(filename)), MDStoreVersion.class); - mdstore.setHdfsPath(String.format(mdstore.getHdfsPath(), workingDir.toString())); - return mdstore; - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 6a80e01e2..9e46b5f95 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -38,6 +38,7 @@ import eu.dnetlib.dhp.collection.CollectionJobTest; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -74,6 +75,9 @@ public class TransformationJobTest { spark.stop(); } + + + @Test @DisplayName("Test Transform Single XML using XSLTTransformator") public void testTransformSaxonHE() throws Exception { diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json new file mode 100644 index 000000000..a5adc8fda --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json @@ -0,0 +1,9 @@ +{ + "id":"md-cleaned", + "mdstore":"md-cleaned", + "writing":false, + "readCount":1, + "lastUpdate":1612187563099, + "size":71, 
+ "hdfsPath":"%s/mdstore/md-cleaned" +} \ No newline at end of file From 75807ea5ae69a1776b65ff3b31a18db127e80835 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 2 Feb 2021 12:28:21 +0100 Subject: [PATCH 20/86] factored out constants --- .../common/AggregationConstants.java | 15 +++++ .../common/AggregationUtility.java | 3 + .../GenerateNativeStoreSparkJob.java | 55 +++++++------------ .../worker/CollectorWorkerApplication.java | 4 +- .../transformation/TransformSparkJobNode.java | 32 +++++++---- .../dhp/aggregation/AggregationJobTest.java | 2 +- 6 files changed, 63 insertions(+), 48 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java new file mode 100644 index 000000000..15e0bb454 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java @@ -0,0 +1,15 @@ +package eu.dnetlib.dhp.aggregation.common; + +public class AggregationConstants { + + public static final String SEQUENCE_FILE_NAME = "/sequence_file"; + public static final String MDSTORE_DATA_PATH = "/store"; + public static final String MDSTORE_SIZE_PATH = "/size"; + + public static final String CONTENT_TOTALITEMS = "TotalItems"; + public static final String CONTENT_INVALIDRECORDS = "InvalidRecords"; + public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems"; + + + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java index eb971c475..d657dee02 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java @@ -5,6 +5,7 @@ import java.io.BufferedOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; +import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -21,6 +22,8 @@ public class AggregationUtility { private static final Logger log = LoggerFactory.getLogger(AggregationUtility.class); + public static final ObjectMapper MAPPER = new ObjectMapper(); + public static void writeTotalSizeOnHDFS(final SparkSession spark, final Long total, final String path) throws IOException { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index bbed36a9c..13813623c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -1,15 +1,11 @@ package eu.dnetlib.dhp.collection; -import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.io.*; -import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Objects; -import java.util.Optional; 
- +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.model.mdstore.Provenance; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.IntWritable; @@ -26,26 +22,22 @@ import org.dom4j.Node; import org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; -import net.sf.saxon.expr.Component; import scala.Tuple2; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Objects; +import java.util.Optional; + +import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; +import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + public class GenerateNativeStoreSparkJob { private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class); - private static final ObjectMapper MAPPER = new ObjectMapper(); - - private static final String DATASET_NAME = "/store"; - public static void main(String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -88,11 +80,6 @@ public class GenerateNativeStoreSparkJob { log.info("isSparkSessionManaged: {}", isSparkSessionManaged); SparkConf conf = new SparkConf(); - /* - * conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf .registerKryoClasses( new - * Class[] { MetadataRecord.class, Provenance.class }); - */ - runWithSparkSession( conf, isSparkSessionManaged, @@ -109,10 +96,10 @@ public class GenerateNativeStoreSparkJob { MDStoreVersion readVersion) throws IOException { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); - final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); + final LongAccumulator totalItems = sc.sc().longAccumulator(CONTENT_TOTALITEMS); + final LongAccumulator invalidRecords = sc.sc().longAccumulator(CONTENT_INVALIDRECORDS); - final String seqFilePath = currentVersion.getHdfsPath() + CollectorWorkerApplication.SEQUENCE_FILE_NAME; + final String seqFilePath = currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME; final JavaRDD nativeStore = sc .sequenceFile(seqFilePath, IntWritable.class, Text.class) .map( @@ -130,13 +117,13 @@ public class GenerateNativeStoreSparkJob { final Encoder encoder = Encoders.bean(MetadataRecord.class); final Dataset mdstore = spark.createDataset(nativeStore.rdd(), encoder); - final String targetPath = currentVersion.getHdfsPath() + DATASET_NAME; + final String targetPath = currentVersion.getHdfsPath() + MDSTORE_DATA_PATH; if (readVersion != null) { // INCREMENTAL MODE log.info("updating {} incrementally with {}", targetPath, readVersion.getHdfsPath()); Dataset currentMdStoreVersion = spark .read() - 
.load(readVersion.getHdfsPath() + DATASET_NAME) + .load(readVersion.getHdfsPath() + MDSTORE_DATA_PATH) .as(encoder); TypedColumn aggregator = new MDStoreAggregator().toColumn(); @@ -159,7 +146,7 @@ public class GenerateNativeStoreSparkJob { final Long total = spark.read().load(targetPath).count(); log.info("collected {} records for datasource '{}'", total, provenance.getDatasourceName()); - writeTotalSizeOnHDFS(spark, total, currentVersion.getHdfsPath() + "/size"); + writeTotalSizeOnHDFS(spark, total, currentVersion.getHdfsPath() + MDSTORE_SIZE_PATH); } public static class MDStoreAggregator extends Aggregator { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java index e24b9ad1d..da5b197d6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.collection.worker; +import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; + import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,8 +27,6 @@ public class CollectorWorkerApplication { private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory(); - public static String SEQUENCE_FILE_NAME = "/sequence_file"; - /** * @param args */ diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index 193da3878..f8ddf47e2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -2,14 +2,17 @@ package eu.dnetlib.dhp.transformation; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; +import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; import java.io.IOException; import java.util.Map; import java.util.Optional; +import eu.dnetlib.dhp.aggregation.common.AggregationConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.MapFunction; + import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; @@ -76,29 +79,36 @@ public class TransformSparkJobNode { conf, isSparkSessionManaged, spark -> transformRecords( - parser.getObjectMap(), isLookupService, spark, nativeMdStoreVersion.getHdfsPath() + "/store", - cleanedMdStoreVersion.getHdfsPath() + "/store")); + parser.getObjectMap(), isLookupService, spark, nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH, + cleanedMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH)); } public static void transformRecords(final Map args, final ISLookUpService isLookUpService, final SparkSession spark, final String inputPath, final String outputPath) throws DnetTransformationException, IOException { - final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems"); - final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems"); - final 
LongAccumulator transformedItems = spark.sparkContext().longAccumulator("transformedItems"); + final LongAccumulator totalItems = spark.sparkContext().longAccumulator(CONTENT_TOTALITEMS); + final LongAccumulator errorItems = spark.sparkContext().longAccumulator(CONTENT_INVALIDRECORDS); + final LongAccumulator transformedItems = spark.sparkContext().longAccumulator(CONTENT_TRANSFORMEDRECORDS); final AggregationCounter ct = new AggregationCounter(totalItems, errorItems, transformedItems); final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder); - final MapFunction XSLTTransformationFunction = TransformationFactory - .getTransformationPlugin(args, ct, isLookUpService); - mdstoreInput.map(XSLTTransformationFunction, encoder).write().save(outputPath + "/store"); + + saveDataset( + spark.read() + .format("parquet") + .load(inputPath) + .as(encoder) + .map( + TransformationFactory.getTransformationPlugin(args, ct, isLookUpService), + encoder), + outputPath + MDSTORE_DATA_PATH); + log.info("Transformed item " + ct.getProcessedItems().count()); log.info("Total item " + ct.getTotalItems().count()); log.info("Transformation Error item " + ct.getErrorItems().count()); - AggregationUtility.writeTotalSizeOnHDFS(spark, ct.getProcessedItems().count(), outputPath + "/size"); + writeTotalSizeOnHDFS(spark, ct.getProcessedItems().count(), outputPath + MDSTORE_SIZE_PATH); } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java index c9ccbc7ff..ac65ef6a9 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java @@ -145,7 +145,7 @@ public class AggregationJobTest { } - //@Test + @Test @Order(3) public void testTransformSparkJob() throws Exception { From bb89b99b24d4ad7e2bf05d383c87a74874af4929 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 2 Feb 2021 12:34:14 +0100 Subject: [PATCH 21/86] code formatting --- .../common/AggregationConstants.java | 15 +- .../common/AggregationUtility.java | 3 +- .../GenerateNativeStoreSparkJob.java | 32 +-- .../transformation/TransformSparkJobNode.java | 28 +- .../dhp/aggregation/AggregationJobTest.java | 250 +++++++++--------- .../transformation/TransformationJobTest.java | 3 - 6 files changed, 164 insertions(+), 167 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java index 15e0bb454..7c5ad354d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java @@ -1,15 +1,14 @@ + package eu.dnetlib.dhp.aggregation.common; public class AggregationConstants { - public static final String SEQUENCE_FILE_NAME = "/sequence_file"; - public static final String MDSTORE_DATA_PATH = "/store"; - public static final String MDSTORE_SIZE_PATH = "/size"; - - public static final String CONTENT_TOTALITEMS = "TotalItems"; - public static final String CONTENT_INVALIDRECORDS = "InvalidRecords"; - public static final String CONTENT_TRANSFORMEDRECORDS = 
"transformedItems"; - + public static final String SEQUENCE_FILE_NAME = "/sequence_file"; + public static final String MDSTORE_DATA_PATH = "/store"; + public static final String MDSTORE_SIZE_PATH = "/size"; + public static final String CONTENT_TOTALITEMS = "TotalItems"; + public static final String CONTENT_INVALIDRECORDS = "InvalidRecords"; + public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems"; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java index d657dee02..7332ac071 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java @@ -5,7 +5,6 @@ import java.io.BufferedOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -15,6 +14,8 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index 13813623c..fdf3965d6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -1,11 +1,16 @@ package eu.dnetlib.dhp.collection; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; +import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; +import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Objects; +import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.IntWritable; @@ -22,18 +27,15 @@ import org.dom4j.Node; import org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.model.mdstore.Provenance; import scala.Tuple2; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.Objects; -import java.util.Optional; - -import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; -import static 
eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - public class GenerateNativeStoreSparkJob { private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index f8ddf47e2..0a01faf1e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -1,19 +1,17 @@ package eu.dnetlib.dhp.transformation; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; +import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.saveDataset; +import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.writeTotalSizeOnHDFS; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.IOException; import java.util.Map; import java.util.Optional; -import eu.dnetlib.dhp.aggregation.common.AggregationConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; - -import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; @@ -25,7 +23,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.aggregation.common.AggregationCounter; -import eu.dnetlib.dhp.aggregation.common.AggregationUtility; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; @@ -67,7 +64,6 @@ public class TransformSparkJobNode { final String dateOfTransformation = parser.get("dateOfTransformation"); log.info(String.format("dateOfTransformation: %s", dateOfTransformation)); - final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl); final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService); @@ -94,15 +90,15 @@ public class TransformSparkJobNode { final Encoder encoder = Encoders.bean(MetadataRecord.class); saveDataset( - spark.read() - .format("parquet") - .load(inputPath) - .as(encoder) - .map( - TransformationFactory.getTransformationPlugin(args, ct, isLookUpService), - encoder), - outputPath + MDSTORE_DATA_PATH); - + spark + .read() + .format("parquet") + .load(inputPath) + .as(encoder) + .map( + TransformationFactory.getTransformationPlugin(args, ct, isLookUpService), + encoder), + outputPath + MDSTORE_DATA_PATH); log.info("Transformed item " + ct.getProcessedItems().count()); log.info("Total item " + ct.getTotalItems().count()); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java index ac65ef6a9..d5ecc9cb0 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java @@ 
-12,11 +12,6 @@ import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; -import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob; -import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; -import eu.dnetlib.dhp.transformation.TransformSparkJobNode; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.IntWritable; @@ -35,163 +30,170 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.TransformSparkJobNode; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @TestMethodOrder(MethodOrderer.OrderAnnotation.class) public class AggregationJobTest { - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - private static Encoder encoder; + private static Encoder encoder; - private static final String encoding = "XML"; - private static final String dateOfCollection = System.currentTimeMillis() + ""; - private static final String xpath = "//*[local-name()='header']/*[local-name()='identifier']"; - private static String provenance; + private static final String encoding = "XML"; + private static final String dateOfCollection = System.currentTimeMillis() + ""; + private static final String xpath = "//*[local-name()='header']/*[local-name()='identifier']"; + private static String provenance; - private static final Logger log = LoggerFactory.getLogger(AggregationJobTest.class); + private static final Logger log = LoggerFactory.getLogger(AggregationJobTest.class); - @BeforeAll - public static void beforeAll() throws IOException { - provenance = IOUtils.toString(AggregationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/collection/provenance.json")); - workingDir = Files.createTempDirectory(AggregationJobTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + provenance = IOUtils + .toString(AggregationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/collection/provenance.json")); + workingDir = Files.createTempDirectory(AggregationJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - conf.setAppName(AggregationJobTest.class.getSimpleName()); + conf.setAppName(AggregationJobTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + 
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - encoder = Encoders.bean(MetadataRecord.class); - spark = SparkSession - .builder() - .appName(AggregationJobTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + encoder = Encoders.bean(MetadataRecord.class); + spark = SparkSession + .builder() + .appName(AggregationJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - @Order(1) - public void testGenerateNativeStoreSparkJobRefresh() throws Exception { + @Test + @Order(1) + public void testGenerateNativeStoreSparkJobRefresh() throws Exception { - MDStoreVersion mdStoreV1 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_1.json"); - FileUtils.forceMkdir(new File(mdStoreV1.getHdfsPath())); + MDStoreVersion mdStoreV1 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_1.json"); + FileUtils.forceMkdir(new File(mdStoreV1.getHdfsPath())); - IOUtils - .copy( - getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/sequence_file"), - new FileOutputStream(mdStoreV1.getHdfsPath() + "/sequence_file")); + IOUtils + .copy( + getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/sequence_file"), + new FileOutputStream(mdStoreV1.getHdfsPath() + "/sequence_file")); - GenerateNativeStoreSparkJob - .main( - new String[]{ - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-encoding", encoding, - "-dateOfCollection", dateOfCollection, - "-provenance", provenance, - "-xpath", xpath, - "-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1), - "-readMdStoreVersion", "", - "-workflowId", "abc" - }); + GenerateNativeStoreSparkJob + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-encoding", encoding, + "-dateOfCollection", dateOfCollection, + "-provenance", provenance, + "-xpath", xpath, + "-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1), + "-readMdStoreVersion", "", + "-workflowId", "abc" + }); - verify(mdStoreV1); - } + verify(mdStoreV1); + } - @Test - @Order(2) - public void testGenerateNativeStoreSparkJobIncremental() throws Exception { + @Test + @Order(2) + public void testGenerateNativeStoreSparkJobIncremental() throws Exception { - MDStoreVersion mdStoreV2 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_2.json"); - FileUtils.forceMkdir(new File(mdStoreV2.getHdfsPath())); + MDStoreVersion mdStoreV2 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_2.json"); + FileUtils.forceMkdir(new File(mdStoreV2.getHdfsPath())); - IOUtils - .copy( - getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/sequence_file"), - new FileOutputStream(mdStoreV2.getHdfsPath() + "/sequence_file")); + IOUtils + .copy( + getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/sequence_file"), + new FileOutputStream(mdStoreV2.getHdfsPath() + "/sequence_file")); - MDStoreVersion mdStoreV1 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_1.json"); + MDStoreVersion mdStoreV1 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_1.json"); - GenerateNativeStoreSparkJob - .main( - new String[]{ - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-encoding", encoding, - "-dateOfCollection", dateOfCollection, - "-provenance", provenance, - "-xpath", xpath, - 
"-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV2), - "-readMdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1), - "-workflowId", "abc" - }); + GenerateNativeStoreSparkJob + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-encoding", encoding, + "-dateOfCollection", dateOfCollection, + "-provenance", provenance, + "-xpath", xpath, + "-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV2), + "-readMdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1), + "-workflowId", "abc" + }); - verify(mdStoreV2); - } + verify(mdStoreV2); + } + @Test + @Order(3) + public void testTransformSparkJob() throws Exception { - @Test - @Order(3) - public void testTransformSparkJob() throws Exception { + MDStoreVersion mdStoreV2 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_2.json"); + MDStoreVersion mdStoreCleanedVersion = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json"); - MDStoreVersion mdStoreV2 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_2.json"); - MDStoreVersion mdStoreCleanedVersion = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json"); + TransformSparkJobNode.main(new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-dateOfTransformation", dateOfCollection, + "-mdstoreInputVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV2), + "-mdstoreOutputVersion", OBJECT_MAPPER.writeValueAsString(mdStoreCleanedVersion), + "-transformationPlugin", "XSLT_TRANSFORM", + "-isLookupUrl", "https://dev-openaire.d4science.org/is/services/isLookUp", + "-transformationRuleId", + "183dde52-a69b-4db9-a07e-1ef2be105294_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU=" + }); - TransformSparkJobNode.main(new String[]{ - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-dateOfTransformation", dateOfCollection, - "-mdstoreInputVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV2), - "-mdstoreOutputVersion", OBJECT_MAPPER.writeValueAsString(mdStoreCleanedVersion), - "-transformationPlugin", "XSLT_TRANSFORM", - "-isLookupUrl", "https://dev-openaire.d4science.org/is/services/isLookUp", - "-transformationRuleId", "183dde52-a69b-4db9-a07e-1ef2be105294_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="}); + } - } + protected void verify(MDStoreVersion mdStoreVersion) throws IOException { + Assertions.assertTrue(new File(mdStoreVersion.getHdfsPath()).exists()); - protected void verify(MDStoreVersion mdStoreVersion) throws IOException { - Assertions.assertTrue(new File(mdStoreVersion.getHdfsPath()).exists()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + long seqFileSize = sc + .sequenceFile(mdStoreVersion.getHdfsPath() + "/sequence_file", IntWritable.class, Text.class) + .count(); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - long seqFileSize = sc - .sequenceFile(mdStoreVersion.getHdfsPath() + "/sequence_file", IntWritable.class, Text.class) - .count(); + final Dataset mdstore = spark.read().load(mdStoreVersion.getHdfsPath() + "/store").as(encoder); + long mdStoreSize = mdstore.count(); - final Dataset mdstore = spark.read().load(mdStoreVersion.getHdfsPath() + "/store").as(encoder); - long mdStoreSize = mdstore.count(); + long declaredSize = Long.parseLong(IOUtils.toString(new FileReader(mdStoreVersion.getHdfsPath() + "/size"))); - long declaredSize = Long.parseLong(IOUtils.toString(new 
FileReader(mdStoreVersion.getHdfsPath() + "/size"))); + Assertions.assertEquals(seqFileSize, declaredSize, "the size must be equal"); + Assertions.assertEquals(seqFileSize, mdStoreSize, "the size must be equal"); - Assertions.assertEquals(seqFileSize, declaredSize, "the size must be equal"); - Assertions.assertEquals(seqFileSize, mdStoreSize, "the size must be equal"); + long uniqueIds = mdstore + .map((MapFunction) MetadataRecord::getId, Encoders.STRING()) + .distinct() + .count(); - long uniqueIds = mdstore - .map((MapFunction) MetadataRecord::getId, Encoders.STRING()) - .distinct() - .count(); + Assertions.assertEquals(seqFileSize, uniqueIds, "the size must be equal"); + } - Assertions.assertEquals(seqFileSize, uniqueIds, "the size must be equal"); - } - - private MDStoreVersion prepareVersion(String filename) throws IOException { - MDStoreVersion mdstore = OBJECT_MAPPER - .readValue(IOUtils.toString(getClass().getResource(filename)), MDStoreVersion.class); - mdstore.setHdfsPath(String.format(mdstore.getHdfsPath(), workingDir.toString())); - return mdstore; - } + private MDStoreVersion prepareVersion(String filename) throws IOException { + MDStoreVersion mdstore = OBJECT_MAPPER + .readValue(IOUtils.toString(getClass().getResource(filename)), MDStoreVersion.class); + mdstore.setHdfsPath(String.format(mdstore.getHdfsPath(), workingDir.toString())); + return mdstore; + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 9e46b5f95..d03c3acef 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -75,9 +75,6 @@ public class TransformationJobTest { spark.stop(); } - - - @Test @DisplayName("Test Transform Single XML using XSLTTransformator") public void testTransformSaxonHE() throws Exception { From ca4391aa1c5c03ecb0477fa287b77da97e3f9c8b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 2 Feb 2021 12:44:04 +0100 Subject: [PATCH 22/86] minor changes --- .../transformation/TransformSparkJobNode.java | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index 0a01faf1e..51f69de10 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -2,8 +2,7 @@ package eu.dnetlib.dhp.transformation; import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; -import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.saveDataset; -import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.writeTotalSizeOnHDFS; +import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.IOException; @@ -19,8 +18,6 @@ import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import 
eu.dnetlib.dhp.aggregation.common.AggregationCounter; import eu.dnetlib.dhp.application.ArgumentApplicationParser; @@ -52,11 +49,14 @@ public class TransformSparkJobNode { final String mdstoreInputVersion = parser.get("mdstoreInputVersion"); final String mdstoreOutputVersion = parser.get("mdstoreOutputVersion"); - // TODO this variable will be used after implementing Messaging with DNet Aggregator - final ObjectMapper jsonMapper = new ObjectMapper(); - final MDStoreVersion nativeMdStoreVersion = jsonMapper.readValue(mdstoreInputVersion, MDStoreVersion.class); - final MDStoreVersion cleanedMdStoreVersion = jsonMapper.readValue(mdstoreOutputVersion, MDStoreVersion.class); + final MDStoreVersion nativeMdStoreVersion = MAPPER.readValue(mdstoreInputVersion, MDStoreVersion.class); + final String inputPath = nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH; + log.info("input path: {}", inputPath); + + final MDStoreVersion cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, MDStoreVersion.class); + final String outputPath = cleanedMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH; + log.info("output path: {}", outputPath); final String isLookupUrl = parser.get("isLookupUrl"); log.info(String.format("isLookupUrl: %s", isLookupUrl)); @@ -74,9 +74,10 @@ public class TransformSparkJobNode { runWithSparkSession( conf, isSparkSessionManaged, - spark -> transformRecords( - parser.getObjectMap(), isLookupService, spark, nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH, - cleanedMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH)); + spark -> { + transformRecords( + parser.getObjectMap(), isLookupService, spark, inputPath, outputPath); + }); } public static void transformRecords(final Map args, final ISLookUpService isLookUpService, From bde14b149a5e1d5eb249ef80db9d6d1a10d670a7 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 2 Feb 2021 12:49:29 +0100 Subject: [PATCH 23/86] fixed transformation target paths --- .../transformation/TransformSparkJobNode.java | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index 51f69de10..e1830ed28 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -11,6 +11,7 @@ import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; @@ -52,11 +53,11 @@ public class TransformSparkJobNode { final MDStoreVersion nativeMdStoreVersion = MAPPER.readValue(mdstoreInputVersion, MDStoreVersion.class); final String inputPath = nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH; - log.info("input path: {}", inputPath); + log.info("inputPath: {}", inputPath); final MDStoreVersion cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, MDStoreVersion.class); - final String outputPath = cleanedMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH; - log.info("output path: {}", outputPath); + final String outputBasePath = cleanedMdStoreVersion.getHdfsPath(); + log.info("outputBasePath: {}", outputBasePath); final String isLookupUrl = parser.get("isLookupUrl"); 
log.info(String.format("isLookupUrl: %s", isLookupUrl)); @@ -76,12 +77,12 @@ public class TransformSparkJobNode { isSparkSessionManaged, spark -> { transformRecords( - parser.getObjectMap(), isLookupService, spark, inputPath, outputPath); + parser.getObjectMap(), isLookupService, spark, inputPath, outputBasePath); }); } public static void transformRecords(final Map args, final ISLookUpService isLookUpService, - final SparkSession spark, final String inputPath, final String outputPath) + final SparkSession spark, final String inputPath, final String outputBasePath) throws DnetTransformationException, IOException { final LongAccumulator totalItems = spark.sparkContext().longAccumulator(CONTENT_TOTALITEMS); @@ -90,22 +91,21 @@ public class TransformSparkJobNode { final AggregationCounter ct = new AggregationCounter(totalItems, errorItems, transformedItems); final Encoder encoder = Encoders.bean(MetadataRecord.class); - saveDataset( - spark + final Dataset mdstore = spark .read() .format("parquet") .load(inputPath) .as(encoder) .map( - TransformationFactory.getTransformationPlugin(args, ct, isLookUpService), - encoder), - outputPath + MDSTORE_DATA_PATH); + TransformationFactory.getTransformationPlugin(args, ct, isLookUpService), + encoder); + saveDataset(mdstore, outputBasePath + MDSTORE_DATA_PATH); log.info("Transformed item " + ct.getProcessedItems().count()); log.info("Total item " + ct.getTotalItems().count()); log.info("Transformation Error item " + ct.getErrorItems().count()); - writeTotalSizeOnHDFS(spark, ct.getProcessedItems().count(), outputPath + MDSTORE_SIZE_PATH); + writeTotalSizeOnHDFS(spark, mdstore.count(), outputBasePath + MDSTORE_SIZE_PATH); } } From ac46c247d2261c2dc2a1c5845d6355ca5088537f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 2 Feb 2021 14:24:00 +0100 Subject: [PATCH 24/86] code formatting --- .../dhp/transformation/TransformSparkJobNode.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index e1830ed28..e1b1b849c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -92,13 +92,13 @@ public class TransformSparkJobNode { final Encoder encoder = Encoders.bean(MetadataRecord.class); final Dataset mdstore = spark - .read() - .format("parquet") - .load(inputPath) - .as(encoder) - .map( - TransformationFactory.getTransformationPlugin(args, ct, isLookUpService), - encoder); + .read() + .format("parquet") + .load(inputPath) + .as(encoder) + .map( + TransformationFactory.getTransformationPlugin(args, ct, isLookUpService), + encoder); saveDataset(mdstore, outputBasePath + MDSTORE_DATA_PATH); log.info("Transformed item " + ct.getProcessedItems().count()); From 53884d12c29d8ba746c4e6ebb68492b4212a1c45 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 2 Feb 2021 14:38:03 +0100 Subject: [PATCH 25/86] code formatting --- .../dhp/collection/oozie_app/workflow.xml | 19 +++---------------- .../dhp/transformation/oozie_app/workflow.xml | 11 ++++------- 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml 
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index 9c213bee5..2b2cf9dce 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -4,7 +4,6 @@ apiDescription A json encoding of the API Description class - dataSourceInfo A json encoding of the Datasource Info @@ -13,50 +12,43 @@ identifierPath An xpath to retrieve the metadata identifier for the generation of DNet Identifier - metadataEncoding The type of the metadata XML/JSON - timestamp The timestamp of the collection date - workflowId The identifier of the workflow - mdStoreID The identifier of the mdStore - mdStoreManagerURI The URI of the MDStore Manager - collectionMode Should be REFRESH or INCREMENTAL - + ${jobTracker} ${nameNode} - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - ${wf:conf('collectionMode') eq 'REFRESH'} @@ -77,8 +69,6 @@ - - eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode @@ -129,7 +119,6 @@ - ${wf:conf('collectionMode') eq 'REFRESH'} @@ -182,8 +171,6 @@ - - eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode @@ -195,6 +182,6 @@ - + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml index 43b270eaf..9e01936d4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml @@ -29,11 +29,10 @@ isLookupUrl The IS lookUp service endpoint - - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -51,11 +50,11 @@ --mdStoreID${mdStoreInputId} --mdStoreManagerURI${mdStoreManagerURI} - + @@ -69,7 +68,6 @@ --mdStoreID${mdStoreOutputId} --mdStoreManagerURI${mdStoreManagerURI} - @@ -173,8 +171,7 @@ - - - + + \ No newline at end of file From bde14b149a5e1d5eb249ef80db9d6d1a10d670a7 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 3 Feb 2021 12:33:41 +0100 Subject: [PATCH 26/86] better logging, WIP: collectorWorker error reporting --- .../dhp/application/ApplicationUtils.java | 21 +++++ .../mdstore/MDStoreActionNode.java | 32 +++---- .../collection/plugin/CollectorPlugin.java | 3 + .../plugin/oai/OaiCollectorPlugin.java | 21 ++++- .../collection/plugin/oai/OaiIterator.java | 17 +++- .../plugin/oai/OaiIteratorFactory.java | 6 +- .../collection/worker/CollectorWorker.java | 87 +++++++++---------- .../worker/CollectorWorkerApplication.java | 20 +++-- .../worker/utils/CollectorPluginFactory.java | 2 +- .../worker/utils/HttpConnector.java | 84 +++++++----------- .../DnetCollectorWorkerApplicationTests.java | 2 +- 11 files changed, 159 insertions(+), 136 deletions(-) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java new file mode 100644 index 000000000..531c13af3 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java @@ -0,0 +1,21 @@ + +package eu.dnetlib.dhp.application; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.OutputStream; +import 
java.util.Properties; + +public class ApplicationUtils { + + public static void populateOOZIEEnv(final String paramName, String value) throws Exception { + File file = new File(System.getProperty("oozie.action.output.properties")); + Properties props = new Properties(); + + props.setProperty(paramName, value); + OutputStream os = new FileOutputStream(file); + props.store(os, ""); + os.close(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java index 6cb0537b2..3e471cfc8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java @@ -1,6 +1,9 @@ package eu.dnetlib.dhp.aggregation.mdstore; +import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; +import static eu.dnetlib.dhp.application.ApplicationUtils.*; + import java.io.File; import java.io.FileOutputStream; import java.io.OutputStream; @@ -16,11 +19,8 @@ import org.apache.hadoop.fs.Path; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.CollectorWorker; import eu.dnetlib.dhp.common.rest.DNetRestClient; public class MDStoreActionNode { @@ -28,11 +28,8 @@ public class MDStoreActionNode { enum MDAction { NEW_VERSION, ROLLBACK, COMMIT, READ_LOCK, READ_UNLOCK - } - private static final ObjectMapper mapper = new ObjectMapper(); - public static String NEW_VERSION_URI = "%s/mdstore/%s/newVersion"; public static final String COMMIT_VERSION_URL = "%s/version/%s/commit/%s"; @@ -48,13 +45,13 @@ public class MDStoreActionNode { final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser( IOUtils .toString( - CollectorWorker.class + MDStoreActionNode.class .getResourceAsStream( "/eu/dnetlib/dhp/collection/mdstore_action_parameters.json"))); argumentParser.parseArgument(args); final MDAction action = MDAction.valueOf(argumentParser.get("action")); - log.info("Curren action is {}", action); + log.info("Current action is {}", action); final String mdStoreManagerURI = argumentParser.get("mdStoreManagerURI"); log.info("mdStoreManagerURI is {}", mdStoreManagerURI); @@ -67,7 +64,7 @@ public class MDStoreActionNode { } final MDStoreVersion currentVersion = DNetRestClient .doGET(String.format(NEW_VERSION_URI, mdStoreManagerURI, mdStoreID), MDStoreVersion.class); - populateOOZIEEnv(MDSTOREVERSIONPARAM, mapper.writeValueAsString(currentVersion)); + populateOOZIEEnv(MDSTOREVERSIONPARAM, MAPPER.writeValueAsString(currentVersion)); break; } case COMMIT: { @@ -77,7 +74,7 @@ public class MDStoreActionNode { throw new IllegalArgumentException("missing or empty argument namenode"); } final String mdStoreVersion_params = argumentParser.get("mdStoreVersion"); - final MDStoreVersion mdStoreVersion = mapper.readValue(mdStoreVersion_params, MDStoreVersion.class); + final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class); if (StringUtils.isBlank(mdStoreVersion.getId())) { throw new IllegalArgumentException( @@ -110,7 +107,7 @@ public class MDStoreActionNode { } case ROLLBACK: { final String mdStoreVersion_params = 
argumentParser.get("mdStoreVersion"); - final MDStoreVersion mdStoreVersion = mapper.readValue(mdStoreVersion_params, MDStoreVersion.class); + final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class); if (StringUtils.isBlank(mdStoreVersion.getId())) { throw new IllegalArgumentException( @@ -127,12 +124,12 @@ public class MDStoreActionNode { } final MDStoreVersion currentVersion = DNetRestClient .doGET(String.format(READ_LOCK_URL, mdStoreManagerURI, mdStoreID), MDStoreVersion.class); - populateOOZIEEnv(MDSTOREREADLOCKPARAM, mapper.writeValueAsString(currentVersion)); + populateOOZIEEnv(MDSTOREREADLOCKPARAM, MAPPER.writeValueAsString(currentVersion)); break; } case READ_UNLOCK: { final String mdStoreVersion_params = argumentParser.get("readMDStoreId"); - final MDStoreVersion mdStoreVersion = mapper.readValue(mdStoreVersion_params, MDStoreVersion.class); + final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class); if (StringUtils.isBlank(mdStoreVersion.getId())) { throw new IllegalArgumentException( @@ -148,13 +145,4 @@ public class MDStoreActionNode { } - public static void populateOOZIEEnv(final String paramName, String value) throws Exception { - File file = new File(System.getProperty("oozie.action.output.properties")); - Properties props = new Properties(); - - props.setProperty(paramName, value); - OutputStream os = new FileOutputStream(file); - props.store(os, ""); - os.close(); - } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index ba9bd662e..a0c546858 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -4,9 +4,12 @@ package eu.dnetlib.dhp.collection.plugin; import java.util.stream.Stream; import eu.dnetlib.dhp.collection.worker.CollectorException; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; public interface CollectorPlugin { Stream collect(ApiDescriptor api) throws CollectorException; + + CollectorPluginErrorLogList getCollectionErrors(); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index a5e261553..ea74919c5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -9,12 +9,15 @@ import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; +import org.jetbrains.annotations.NotNull; + import com.google.common.base.Splitter; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.worker.CollectorException; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; public class OaiCollectorPlugin implements CollectorPlugin { @@ -26,8 +29,19 @@ public class OaiCollectorPlugin implements CollectorPlugin { 
private OaiIteratorFactory oaiIteratorFactory; + private final CollectorPluginErrorLogList errorLogList = new CollectorPluginErrorLogList(); + @Override public Stream collect(final ApiDescriptor api) throws CollectorException { + try { + return doCollect(api); + } catch (CollectorException e) { + errorLogList.add(e.getMessage()); + throw e; + } + } + + private Stream doCollect(ApiDescriptor api) throws CollectorException { final String baseUrl = api.getBaseUrl(); final String mdFormat = api.getParams().get(FORMAT_PARAM); final String setParam = api.getParams().get(OAI_SET_PARAM); @@ -65,7 +79,7 @@ public class OaiCollectorPlugin implements CollectorPlugin { .stream() .map( set -> getOaiIteratorFactory() - .newIterator(baseUrl, mdFormat, set, fromDate, untilDate)) + .newIterator(baseUrl, mdFormat, set, fromDate, untilDate, errorLogList)) .iterator(); return StreamSupport @@ -79,4 +93,9 @@ public class OaiCollectorPlugin implements CollectorPlugin { } return oaiIteratorFactory; } + + @Override + public CollectorPluginErrorLogList getCollectionErrors() { + return errorLogList; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index e54bae67d..2392dee6a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -15,15 +15,17 @@ import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Node; import org.dom4j.io.SAXReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.collection.worker.CollectorException; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner; public class OaiIterator implements Iterator { - private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on - // 11/24/08 5:02 PM + private static final Logger log = LoggerFactory.getLogger(OaiIterator.class); private final Queue queue = new PriorityBlockingQueue<>(); private final SAXReader reader = new SAXReader(); @@ -36,6 +38,7 @@ public class OaiIterator implements Iterator { private String token; private boolean started; private final HttpConnector httpConnector; + private CollectorPluginErrorLogList errorLogList; public OaiIterator( final String baseUrl, @@ -43,7 +46,8 @@ public class OaiIterator implements Iterator { final String set, final String fromDate, final String untilDate, - final HttpConnector httpConnector) { + final HttpConnector httpConnector, + final CollectorPluginErrorLogList errorLogList) { this.baseUrl = baseUrl; this.mdFormat = mdFormat; this.set = set; @@ -51,6 +55,7 @@ public class OaiIterator implements Iterator { this.untilDate = untilDate; this.started = false; this.httpConnector = httpConnector; + this.errorLogList = errorLogList; } private void verifyStarted() { @@ -139,7 +144,7 @@ public class OaiIterator implements Iterator { private String downloadPage(final String url) throws CollectorException { - final String xml = httpConnector.getInputSource(url); + final String xml = httpConnector.getInputSource(url, errorLogList); Document doc; try { doc = reader.read(new StringReader(xml)); @@ -174,4 +179,8 @@ public class OaiIterator implements Iterator { return 
doc.valueOf("//*[local-name()='resumptionToken']"); } + + public CollectorPluginErrorLogList getErrorLogList() { + return errorLogList; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java index 4a6ea7f67..eafd265d4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.plugin.oai; import java.util.Iterator; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; public class OaiIteratorFactory { @@ -14,8 +15,9 @@ public class OaiIteratorFactory { final String mdFormat, final String set, final String fromDate, - final String untilDate) { - return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector()); + final String untilDate, + final CollectorPluginErrorLogList errorLogList) { + return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(), errorLogList); } private HttpConnector getHttpConnector() { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java index 3605bdfd6..7033cfd8e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java @@ -15,6 +15,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; @@ -22,69 +23,65 @@ public class CollectorWorker { private static final Logger log = LoggerFactory.getLogger(CollectorWorker.class); - private final CollectorPluginFactory collectorPluginFactory; - private final ApiDescriptor api; private final String hdfsuri; private final String hdfsPath; + private CollectorPlugin plugin; + public CollectorWorker( - final CollectorPluginFactory collectorPluginFactory, final ApiDescriptor api, final String hdfsuri, - final String hdfsPath) { - this.collectorPluginFactory = collectorPluginFactory; + final String hdfsPath) throws CollectorException { this.api = api; this.hdfsuri = hdfsuri; this.hdfsPath = hdfsPath; - + this.plugin = CollectorPluginFactory.getPluginByProtocol(api.getProtocol()); } - public void collect() throws CollectorException { - try { - final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol()); + public CollectorPluginErrorLogList collect() throws IOException, CollectorException { - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set 
FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - System.setProperty("hadoop.home.dir", "/"); - // Get the filesystem - HDFS - FileSystem.get(URI.create(hdfsuri), conf); - Path hdfswritepath = new Path(hdfsPath); + System.setProperty("hadoop.home.dir", "/"); + // Get the filesystem - HDFS - log.info("Created path " + hdfswritepath.toString()); + FileSystem.get(URI.create(hdfsuri), conf); + Path hdfswritepath = new Path(hdfsPath); - final AtomicInteger counter = new AtomicInteger(0); - try (SequenceFile.Writer writer = SequenceFile - .createWriter( - conf, - SequenceFile.Writer.file(hdfswritepath), - SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { - final IntWritable key = new IntWritable(counter.get()); - final Text value = new Text(); - plugin - .collect(api) - .forEach( - content -> { - key.set(counter.getAndIncrement()); - value.set(content); - try { - writer.append(key, value); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - } - } catch (Throwable e) { - throw new CollectorException("Error on collecting ", e); + log.info("Created path " + hdfswritepath.toString()); + + final AtomicInteger counter = new AtomicInteger(0); + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { + final IntWritable key = new IntWritable(counter.get()); + final Text value = new Text(); + plugin + .collect(api) + .forEach( + content -> { + key.set(counter.getAndIncrement()); + value.set(content); + try { + writer.append(key, value); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + return plugin.getCollectionErrors(); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java index da5b197d6..1d99689db 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java @@ -2,6 +2,8 @@ package eu.dnetlib.dhp.collection.worker; import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; +import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; +import static eu.dnetlib.dhp.application.ApplicationUtils.*; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; @@ -10,7 +12,9 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.aggregation.common.AggregationUtility; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; @@ -25,8 +29,6 @@ public class CollectorWorkerApplication { private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class); - private static final CollectorPluginFactory collectorPluginFactory = new 
CollectorPluginFactory(); - /** * @param args */ @@ -49,14 +51,16 @@ public class CollectorWorkerApplication { final String mdStoreVersion = argumentParser.get("mdStoreVersion"); log.info("mdStoreVersion is {}", mdStoreVersion); - final ObjectMapper jsonMapper = new ObjectMapper(); + final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class); + final String hdfsPath = currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME; + log.info("hdfs path is {}", hdfsPath); - final MDStoreVersion currentVersion = jsonMapper.readValue(mdStoreVersion, MDStoreVersion.class); + final ApiDescriptor api = MAPPER.readValue(apiDescriptor, ApiDescriptor.class); - final ApiDescriptor api = jsonMapper.readValue(apiDescriptor, ApiDescriptor.class); - final CollectorWorker worker = new CollectorWorker(collectorPluginFactory, api, hdfsuri, - currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME); - worker.collect(); + final CollectorWorker worker = new CollectorWorker(api, hdfsuri, hdfsPath); + CollectorPluginErrorLogList errors = worker.collect(); + + populateOOZIEEnv("collectorErrors", errors.toString()); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java index 6b070b191..7cbcd9b5c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java @@ -7,7 +7,7 @@ import eu.dnetlib.dhp.collection.worker.CollectorException; public class CollectorPluginFactory { - public CollectorPlugin getPluginByProtocol(final String protocol) throws CollectorException { + public static CollectorPlugin getPluginByProtocol(final String protocol) throws CollectorException { if (protocol == null) throw new CollectorException("protocol cannot be null"); switch (protocol.toLowerCase().trim()) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java index ff3c18aba..fc45b4814 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java @@ -16,14 +16,14 @@ import javax.net.ssl.X509TrustManager; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.math.NumberUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.collection.worker.CollectorException; public class HttpConnector { - private static final Log log = LogFactory.getLog(HttpConnector.class); + private static final Logger log = LoggerFactory.getLogger(HttpConnector.class); private int maxNumberOfRetry = 6; private int defaultDelay = 120; // seconds @@ -45,7 +45,20 @@ public class HttpConnector { * @throws CollectorException when retrying more than maxNumberOfRetry times */ public String getInputSource(final String requestUrl) throws CollectorException { - return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); + return attemptDownloadAsString(requestUrl, 1, new CollectorPluginErrorLogList()); + } + + /** + * Given the URL 
returns the content via HTTP GET + * + * @param requestUrl the URL + * @param errorLogList the list of errors + * @return the content of the downloaded resource + * @throws CollectorException when retrying more than maxNumberOfRetry times + */ + public String getInputSource(final String requestUrl, CollectorPluginErrorLogList errorLogList) + throws CollectorException { + return attemptDownloadAsString(requestUrl, 1, errorLogList); } /** @@ -59,18 +72,20 @@ public class HttpConnector { return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); } - private String attemptDownlaodAsString( + private String attemptDownloadAsString( final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) throws CollectorException { + + log.info("requesting URL [{}]", requestUrl); try { final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); try { return IOUtils.toString(s); } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); + log.error("error while retrieving from http-connection occurred: {}", requestUrl, e); Thread.sleep(defaultDelay * 1000); errorList.add(e.getMessage()); - return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); + return attemptDownloadAsString(requestUrl, retryNumber + 1, errorList); } finally { IOUtils.closeQuietly(s); } @@ -87,7 +102,7 @@ public class HttpConnector { throw new CollectorException("Max number of retries exceeded. Cause: \n " + errorList); } - log.debug("Downloading " + requestUrl + " - try: " + retryNumber); + log.debug("requesting URL [{}], try {}", requestUrl, retryNumber); try { InputStream input = null; @@ -103,7 +118,7 @@ public class HttpConnector { final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { - log.warn("waiting and repeating request after " + retryAfter + " sec."); + log.warn("waiting and repeating request after {} sec.", retryAfter); Thread.sleep(retryAfter * 1000); errorList.add("503 Service Unavailable"); urlConn.disconnect(); @@ -111,7 +126,7 @@ public class HttpConnector { } else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) { final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); - log.debug("The requested url has been moved to " + newUrl); + log.debug("The requested url has been moved to {}", newUrl); errorList .add( String @@ -121,15 +136,11 @@ public class HttpConnector { urlConn.disconnect(); return attemptDownload(newUrl, retryNumber + 1, errorList); } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { - log - .error( - String - .format( - "HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); + final String msg = String + .format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()); + log.error(msg); Thread.sleep(defaultDelay * 1000); - errorList - .add( - String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); + errorList.add(msg); urlConn.disconnect(); return attemptDownload(requestUrl, retryNumber + 1, errorList); } else { @@ -138,7 +149,7 @@ public class HttpConnector { return input; } } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); + log.error("error while retrieving from http-connection occurred: {}", requestUrl, e); 
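
The hunk above corrects the attemptDownlaodAsString typo and moves the connector to parameterized SLF4J logging; on an IOException the statements that follow sleep for defaultDelay seconds, append the message to errorList and try again, while attemptDownload() gives up once maxNumberOfRetry is exceeded. A minimal self-contained sketch of that bounded retry-with-delay idea, under hypothetical names (this is not the connector's actual API):

    import java.io.IOException;
    import java.io.InputStream;
    import java.net.URL;
    import java.nio.charset.StandardCharsets;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.commons.io.IOUtils;

    public class BoundedRetryDownload {

        // Attempts the download up to maxRetry times, sleeping delaySeconds between
        // attempts and keeping one message per failure, like CollectorPluginErrorLogList.
        public static String download(final String url, final int maxRetry, final int delaySeconds)
            throws IOException, InterruptedException {
            final List<String> errors = new ArrayList<>();
            for (int attempt = 1; attempt <= maxRetry; attempt++) {
                try (final InputStream in = new URL(url).openStream()) {
                    return IOUtils.toString(in, StandardCharsets.UTF_8);
                } catch (final IOException e) {
                    errors.add(String.format("Retry #%s: %s", attempt, e.getMessage()));
                    Thread.sleep(delaySeconds * 1000L);
                }
            }
            throw new IOException("Max number of retries exceeded. Cause: " + errors);
        }
    }

An iterative loop like this also avoids the recursion used by the connector, which can deepen the call stack considerably when an endpoint stays flaky for a long time.
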
Thread.sleep(defaultDelay * 1000); errorList.add(e.getMessage()); return attemptDownload(requestUrl, retryNumber + 1, errorList); @@ -149,12 +160,12 @@ } private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { - log.debug("StatusCode: " + urlConn.getResponseMessage()); + log.debug("StatusCode: {}", urlConn.getResponseMessage()); for (final Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) { if (e.getKey() != null) { for (final String v : e.getValue()) { - log.debug(" key: " + e.getKey() + " - value: " + v); + log.debug(" key: {} value: {}", e.getKey(), v); } } } @@ -183,37 +194,6 @@ "The requested url has been MOVED, but 'location' param is MISSING"); } - /** - * register for https scheme; this is a workaround and not intended for the use in trusted environments - */ - public void initTrustManager() { - final X509TrustManager tm = new X509TrustManager() { - - @Override - public void checkClientTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public void checkServerTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - }; - try { - final SSLContext ctx = SSLContext.getInstance("TLS"); - ctx.init(null, new TrustManager[] { - tm - }, null); - HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); - } catch (final GeneralSecurityException e) { - log.fatal(e); - throw new IllegalStateException(e); - } - } - public int getMaxNumberOfRetry() { return maxNumberOfRetry; } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java index 9abfbacac..10964096c 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java @@ -40,7 +40,7 @@ public class DnetCollectorWorkerApplicationTests { public void testFeeding(@TempDir Path testDir) throws Exception { System.out.println(testDir.toString()); - CollectorWorker worker = new CollectorWorker(new CollectorPluginFactory(), getApi(), + CollectorWorker worker = new CollectorWorker(getApi(), "file://" + testDir.toString() + "/file.seq", testDir.toString() + "/file.seq"); worker.collect(); From c286d28ad2c313ba2242145b788bcaf3171712c6 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 3 Feb 2021 16:07:49 +0100 Subject: [PATCH 27/86] logs --- .../data/mdstore/manager/common/model/MDStore.java | 12 +++++++++--- .../manager/common/model/MDStoreCurrentVersion.java | 8 ++++++-- .../manager/common/model/MDStoreVersion.java | 12 +++++++++--- .../manager/common/model/MDStoreWithInfo.java | 13 ++++++++++--- 4 files changed, 34 insertions(+), 11 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java index db200cd6a..59fe941ed 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java @@ -157,7 +157,9 @@ public class MDStore implements Serializable { @Override public String toString() { return String -
.format("MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]", id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate); + .format( + "MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]", + id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate); } @Override @@ -167,8 +169,12 @@ public class MDStore implements Serializable { @Override public boolean equals(final Object obj) { - if (this == obj) { return true; } - if (!(obj instanceof MDStore)) { return false; } + if (this == obj) { + return true; + } + if (!(obj instanceof MDStore)) { + return false; + } final MDStore other = (MDStore) obj; return Objects.equals(id, other.id); } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java index e25e7dc2a..d808e2de7 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java @@ -62,8 +62,12 @@ public class MDStoreCurrentVersion implements Serializable { @Override public boolean equals(final Object obj) { - if (this == obj) { return true; } - if (!(obj instanceof MDStoreCurrentVersion)) { return false; } + if (this == obj) { + return true; + } + if (!(obj instanceof MDStoreCurrentVersion)) { + return false; + } final MDStoreCurrentVersion other = (MDStoreCurrentVersion) obj; return Objects.equals(currentVersion, other.currentVersion) && Objects.equals(mdstore, other.mdstore); } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java index 26c34fcad..38f8f275e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java @@ -116,7 +116,9 @@ public class MDStoreVersion implements Serializable { @Override public String toString() { return String - .format("MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id, mdstore, writing, readCount, lastUpdate, size, hdfsPath); + .format( + "MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id, + mdstore, writing, readCount, lastUpdate, size, hdfsPath); } @Override @@ -126,8 +128,12 @@ public class MDStoreVersion implements Serializable { @Override public boolean equals(final Object obj) { - if (this == obj) { return true; } - if (!(obj instanceof MDStoreVersion)) { return false; } + if (this == obj) { + return true; + } + if (!(obj instanceof MDStoreVersion)) { + return false; + } final MDStoreVersion other = (MDStoreVersion) obj; return Objects.equals(id, other.id); } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java index e34e4c000..510c65092 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java +++ 
b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java @@ -168,7 +168,10 @@ public class MDStoreWithInfo implements Serializable { @Override public String toString() { return String - .format("MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]", id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate, lastUpdate, size, numberOfVersions, hdfsPath); + .format( + "MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]", + id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate, + lastUpdate, size, numberOfVersions, hdfsPath); } @Override @@ -178,8 +181,12 @@ public class MDStoreWithInfo implements Serializable { @Override public boolean equals(final Object obj) { - if (this == obj) { return true; } - if (!(obj instanceof MDStoreWithInfo)) { return false; } + if (this == obj) { + return true; + } + if (!(obj instanceof MDStoreWithInfo)) { + return false; + } final MDStoreWithInfo other = (MDStoreWithInfo) obj; return Objects.equals(id, other.id); } From 820d729e99011887ae1419c4eb4c5fe4ed0b5d6b Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 3 Feb 2021 16:20:34 +0100 Subject: [PATCH 28/86] recover of Message and MessageType class --- .../main/java/eu/dnetlib/message/Message.java | 58 +++++++++++++++++++ .../java/eu/dnetlib/message/MessageType.java | 6 ++ 2 files changed, 64 insertions(+) create mode 100644 dhp-common/src/main/java/eu/dnetlib/message/Message.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/message/MessageType.java diff --git a/dhp-common/src/main/java/eu/dnetlib/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/message/Message.java new file mode 100644 index 000000000..8932e02f3 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/message/Message.java @@ -0,0 +1,58 @@ + +package eu.dnetlib.message; + +import java.util.Map; + +public class Message { + + private String workflowId; + + private String jobName; + + private MessageType type; + + private Map body; + + public Message() { + } + + public Message(final String workflowId, final String jobName, final MessageType type, + final Map body) { + this.workflowId = workflowId; + this.jobName = jobName; + this.type = type; + this.body = body; + } + + public String getWorkflowId() { + return workflowId; + } + + public void setWorkflowId(final String workflowId) { + this.workflowId = workflowId; + } + + public String getJobName() { + return jobName; + } + + public void setJobName(final String jobName) { + this.jobName = jobName; + } + + public MessageType getType() { + return type; + } + + public void setType(final MessageType type) { + this.type = type; + } + + public Map getBody() { + return body; + } + + public void setBody(final Map body) { + this.body = body; + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java new file mode 100644 index 000000000..72cbda252 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java @@ -0,0 +1,6 @@ + +package eu.dnetlib.message; + +public enum MessageType { + ONGOING, REPORT +} From 1b9731632ba7cdc65780eba71aaebb5499faed51 Mon 
Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 3 Feb 2021 16:42:36 +0100 Subject: [PATCH 29/86] Message Sender --- .../main/java/eu/dnetlib/message/Message.java | 13 ++++++- .../eu/dnetlib/message/MessageSender.java | 35 +++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 dhp-common/src/main/java/eu/dnetlib/message/MessageSender.java diff --git a/dhp-common/src/main/java/eu/dnetlib/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/message/Message.java index 8932e02f3..2de8ead42 100644 --- a/dhp-common/src/main/java/eu/dnetlib/message/Message.java +++ b/dhp-common/src/main/java/eu/dnetlib/message/Message.java @@ -1,9 +1,15 @@ package eu.dnetlib.message; +import java.io.Serializable; import java.util.Map; -public class Message { +public class Message implements Serializable { + + /** + * + */ + private static final long serialVersionUID = 401753881204524893L; private String workflowId; @@ -55,4 +61,9 @@ public class Message { public void setBody(final Map body) { this.body = body; } + + @Override + public String toString() { + return String.format("Message [workflowId=%s, jobName=%s, type=%s, body=%s]", workflowId, jobName, type, body); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageSender.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageSender.java new file mode 100644 index 000000000..020d6087f --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageSender.java @@ -0,0 +1,35 @@ + +package eu.dnetlib.message; + +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpPut; +import org.apache.http.entity.SerializableEntity; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class MessageSender { + + private static final Logger log = LoggerFactory.getLogger(MessageSender.class); + + private final String dnetMessageEndpoint; + + public MessageSender(final String dnetMessageEndpoint) { + this.dnetMessageEndpoint = dnetMessageEndpoint; + } + + public void sendMessage(final Message message) { + final HttpPut req = new HttpPut(dnetMessageEndpoint); + req.setEntity(new SerializableEntity(message)); + + try (final CloseableHttpClient client = HttpClients.createDefault(); + final CloseableHttpResponse response = client.execute(req)) { + log.debug("Sent Message to " + dnetMessageEndpoint); + log.debug("MESSAGE:" + message); + } catch (final Throwable e) { + log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e); + } + } + +} From e04045089f5bfc793f15cd27188b04761a61f37a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 3 Feb 2021 17:58:22 +0100 Subject: [PATCH 30/86] better logging, WIP: collectorWorker error reporting --- .../dhp/collection/plugin/oai/OaiIterator.java | 4 ++-- .../worker/CollectorWorkerApplication.java | 13 ++++++++----- .../dnetlib/dhp/collection/oozie_app/workflow.xml | 3 +-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index 2392dee6a..df0722905 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -149,14 +149,14 @@ public class OaiIterator implements Iterator { try { doc = reader.read(new StringReader(xml)); } catch (final DocumentException e) { - log.warn("Error parsing xml, I try to clean it: " + xml, e); + log.warn("Error parsing xml, I try to clean it. {}", e.getMessage()); final String cleaned = XmlCleaner.cleanAllEntities(xml); try { doc = reader.read(new StringReader(cleaned)); } catch (final DocumentException e1) { final String resumptionToken = extractResumptionToken(xml); if (resumptionToken == null) { - throw new CollectorException("Error parsing cleaned document:" + cleaned, e1); + throw new CollectorException("Error parsing cleaned document:\n" + cleaned, e1); } return resumptionToken; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java index 1d99689db..d89bcee54 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java @@ -19,16 +19,19 @@ import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; /** - * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module - * will be executed on the hadoop cluster and taking in input some parameters that tells it which is the right collector - * plugin to use and where store the data into HDFS path + * CollectorWorkerApplication is the main class responsible to start the metadata collection process, storing the outcomes + * into HDFS. 
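
The OaiIterator hunk above also tones down the parse-failure path: a DocumentException now logs only the exception message instead of the whole payload, the response is re-parsed after XmlCleaner.cleanAllEntities(), and only if the cleaned copy fails too is the resumption token recovered from the raw response. A compact sketch of that parse-then-sanitize fallback, where sanitize() merely stands in for XmlCleaner:

    import java.io.StringReader;

    import org.dom4j.Document;
    import org.dom4j.DocumentException;
    import org.dom4j.io.SAXReader;

    public class LenientOaiParser {

        // Strict parse first; on failure, parse a sanitized copy, mirroring OaiIterator.
        public static Document parseLeniently(final String xml) throws DocumentException {
            final SAXReader reader = new SAXReader();
            try {
                return reader.read(new StringReader(xml));
            } catch (final DocumentException e) {
                return reader.read(new StringReader(sanitize(xml)));
            }
        }

        // Hypothetical stand-in for XmlCleaner.cleanAllEntities(): escape bare
        // ampersands, one of the most common causes of broken OAI responses.
        private static String sanitize(final String xml) {
            return xml.replaceAll("&(?![a-zA-Z]+;|#\\d+;)", "&amp;");
        }
    }
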
This application will be executed on the hadoop cluster, where invoked in the context of the metadata collection + * oozie workflow, it will receive all the input parameters necessary to instantiate the specific collection plugin and the + * relative specific configurations * - * @author Sandro La Bruzzo + * @author Sandro La Bruzzo, Claudio Atzori */ public class CollectorWorkerApplication { private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class); + public static final String COLLECTOR_WORKER_ERRORS = "collectorWorker-errors"; + /** * @param args */ @@ -60,7 +63,7 @@ public class CollectorWorkerApplication { final CollectorWorker worker = new CollectorWorker(api, hdfsuri, hdfsPath); CollectorPluginErrorLogList errors = worker.collect(); - populateOOZIEEnv("collectorErrors", errors.toString()); + populateOOZIEEnv(COLLECTOR_WORKER_ERRORS, errors.toString()); } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index 2b2cf9dce..595613a2e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -87,6 +87,7 @@ --apidescriptor${apiDescription} --namenode${nameNode} --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + @@ -133,7 +134,6 @@ --actionREAD_UNLOCK --mdStoreManagerURI${mdStoreManagerURI} --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} - @@ -165,7 +165,6 @@ --actionREAD_UNLOCK --mdStoreManagerURI${mdStoreManagerURI} --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} - From 26d2eb946fc0922cb0596053df3ff77b6143a0c4 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 4 Feb 2021 09:45:46 +0100 Subject: [PATCH 31/86] messages sender --- .../eu/dnetlib/{ => dhp}/message/Message.java | 17 +++--------- .../{ => dhp}/message/MessageSender.java | 26 +++++++++++++++++-- .../java/eu/dnetlib/message/MessageType.java | 6 ----- 3 files changed, 27 insertions(+), 22 deletions(-) rename dhp-common/src/main/java/eu/dnetlib/{ => dhp}/message/Message.java (68%) rename dhp-common/src/main/java/eu/dnetlib/{ => dhp}/message/MessageSender.java (59%) delete mode 100644 dhp-common/src/main/java/eu/dnetlib/message/MessageType.java diff --git a/dhp-common/src/main/java/eu/dnetlib/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java similarity index 68% rename from dhp-common/src/main/java/eu/dnetlib/message/Message.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java index 2de8ead42..57844d490 100644 --- a/dhp-common/src/main/java/eu/dnetlib/message/Message.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java @@ -1,5 +1,5 @@ -package eu.dnetlib.message; +package eu.dnetlib.dhp.message; import java.io.Serializable; import java.util.Map; @@ -15,18 +15,15 @@ public class Message implements Serializable { private String jobName; - private MessageType type; - private Map body; public Message() { } - public Message(final String workflowId, final String jobName, final MessageType type, + public Message(final String workflowId, final String jobName, final Map body) { this.workflowId = workflowId; this.jobName = jobName; - this.type = type; this.body = body; } @@ -46,14 +43,6 @@ public class Message implements Serializable { 
this.jobName = jobName; } - public MessageType getType() { - return type; - } - - public void setType(final MessageType type) { - this.type = type; - } - public Map getBody() { return body; } @@ -64,6 +53,6 @@ public class Message implements Serializable { @Override public String toString() { - return String.format("Message [workflowId=%s, jobName=%s, type=%s, body=%s]", workflowId, jobName, type, body); + return String.format("Message [workflowId=%s, jobName=%s, body=%s]", workflowId, jobName, body); } } diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageSender.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java similarity index 59% rename from dhp-common/src/main/java/eu/dnetlib/message/MessageSender.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java index 020d6087f..70eb594f8 100644 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageSender.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java @@ -1,6 +1,7 @@ -package eu.dnetlib.message; +package eu.dnetlib.dhp.message; +import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPut; import org.apache.http.entity.SerializableEntity; @@ -13,6 +14,12 @@ public class MessageSender { private static final Logger log = LoggerFactory.getLogger(MessageSender.class); + private static final int SOCKET_TIMEOUT_MS = 2000; + + private static final int CONNECTION_REQUEST_TIMEOUT_MS = 2000; + + private static final int CONNTECTION_TIMEOUT_MS = 2000; + private final String dnetMessageEndpoint; public MessageSender(final String dnetMessageEndpoint) { @@ -20,10 +27,25 @@ public class MessageSender { } public void sendMessage(final Message message) { + new Thread(() -> _sendMessage(message)).start(); + } + + private void _sendMessage(final Message message) { final HttpPut req = new HttpPut(dnetMessageEndpoint); req.setEntity(new SerializableEntity(message)); - try (final CloseableHttpClient client = HttpClients.createDefault(); + final RequestConfig requestConfig = RequestConfig + .custom() + .setConnectTimeout(CONNTECTION_TIMEOUT_MS) + .setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS) + .setSocketTimeout(SOCKET_TIMEOUT_MS) + .build(); + ; + + try (final CloseableHttpClient client = HttpClients + .custom() + .setDefaultRequestConfig(requestConfig) + .build(); final CloseableHttpResponse response = client.execute(req)) { log.debug("Sent Message to " + dnetMessageEndpoint); log.debug("MESSAGE:" + message); diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java deleted file mode 100644 index 72cbda252..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java +++ /dev/null @@ -1,6 +0,0 @@ - -package eu.dnetlib.message; - -public enum MessageType { - ONGOING, REPORT -} From 69c253710be8f088d7d8ce4b262db2fdbf79594d Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 4 Feb 2021 10:30:49 +0100 Subject: [PATCH 32/86] fixed test --- .../aggregation/AbstractVocabularyTest.java | 52 +++++++++ .../dhp/aggregation/AggregationJobTest.java | 105 ++++++++++++------ .../transformation/TransformationJobTest.java | 88 +++++---------- 3 files changed, 148 insertions(+), 97 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AbstractVocabularyTest.java diff --git 
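
With the constructor above now taking only the D-Net message endpoint, the sendMessage() method that follows posts each Message from a short-lived thread and caps connection, request and socket waits at two seconds apiece, so a slow or unreachable endpoint cannot stall the caller; delivery failures are logged and otherwise swallowed. A usage sketch with a hypothetical endpoint URL:

    import java.util.HashMap;
    import java.util.Map;

    import eu.dnetlib.dhp.message.Message;
    import eu.dnetlib.dhp.message.MessageSender;

    public class MessageSenderExample {

        public static void main(final String[] args) {
            // Hypothetical endpoint; in production it comes from the workflow configuration.
            final MessageSender sender = new MessageSender("http://localhost:8080/dnet/messages");

            final Map<String, String> body = new HashMap<>();
            body.put("ongoing", "1000"); // illustrative payload

            // Fire-and-forget: the call returns immediately, delivery happens on its
            // own thread with the 2-second timeouts defined above.
            sender.sendMessage(new Message("wf-123", "Collection", body));
        }
    }
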
a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AbstractVocabularyTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AbstractVocabularyTest.java new file mode 100644 index 000000000..84878bd1b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AbstractVocabularyTest.java @@ -0,0 +1,52 @@ +package eu.dnetlib.dhp.aggregation; + +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.transformation.TransformationFactory; +import eu.dnetlib.dhp.transformation.TransformationJobTest; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.io.IOUtils; +import org.mockito.Mock; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; + +import static org.mockito.Mockito.lenient; + +public abstract class AbstractVocabularyTest { + + @Mock + protected ISLookUpService isLookUpService; + + protected VocabularyGroup vocabularies; + + + + public void setUpVocabulary() throws ISLookUpException, IOException { + lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs()); + + lenient() + .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) + .thenReturn(synonyms()); + vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService); + } + + private static List vocs() throws IOException { + return IOUtils + .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt")); + } + + private static List synonyms() throws IOException { + return IOUtils + .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt")); + } + + protected void mockupTrasformationRule(final String trule, final String path) throws Exception { + final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path)); + + lenient() + .when(isLookUpService.quickSearchProfile(String.format(TransformationFactory.TRULE_XQUERY, trule))) + .thenReturn(Collections.singletonList(trValue)); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java index d5ecc9cb0..8f66b6233 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java @@ -1,44 +1,47 @@ package eu.dnetlib.dhp.aggregation; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.TransformSparkJobNode; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; 
+import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.junit.jupiter.MockitoExtension; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.HashMap; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.Text; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoder; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; -import org.junit.jupiter.api.*; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; -import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob; -import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.TransformSparkJobNode; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.MDSTORE_DATA_PATH; +import static org.junit.jupiter.api.Assertions.assertEquals; @TestMethodOrder(MethodOrderer.OrderAnnotation.class) -public class AggregationJobTest { +@ExtendWith(MockitoExtension.class) +public class AggregationJobTest extends AbstractVocabularyTest{ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -55,6 +58,8 @@ public class AggregationJobTest { private static final Logger log = LoggerFactory.getLogger(AggregationJobTest.class); + + @BeforeAll public static void beforeAll() throws IOException { provenance = IOUtils @@ -81,6 +86,8 @@ public class AggregationJobTest { .getOrCreate(); } + + @AfterAll public static void afterAll() throws IOException { FileUtils.deleteDirectory(workingDir.toFile()); @@ -149,19 +156,45 @@ public class AggregationJobTest { @Order(3) public void testTransformSparkJob() throws Exception { + setUpVocabulary(); + MDStoreVersion mdStoreV2 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_2.json"); MDStoreVersion mdStoreCleanedVersion = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json"); - TransformSparkJobNode.main(new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-dateOfTransformation", dateOfCollection, - "-mdstoreInputVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV2), - "-mdstoreOutputVersion", OBJECT_MAPPER.writeValueAsString(mdStoreCleanedVersion), - "-transformationPlugin", "XSLT_TRANSFORM", - "-isLookupUrl", "https://dev-openaire.d4science.org/is/services/isLookUp", - "-transformationRuleId", - "183dde52-a69b-4db9-a07e-1ef2be105294_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU=" - }); + + mockupTrasformationRule("simpleTRule", "/eu/dnetlib/dhp/transform/ext_simple.xsl"); + + final Map parameters = Stream.of(new String[][] { + { + "dateOfTransformation", "1234" + }, + { + "transformationPlugin", "XSLT_TRANSFORM" + }, + { + 
"transformationRuleId", "simpleTRule" + }, + + }).collect(Collectors.toMap(data -> data[0], data -> data[1])); + + TransformSparkJobNode.transformRecords(parameters, isLookUpService, spark, mdStoreV2.getHdfsPath()+MDSTORE_DATA_PATH, mdStoreCleanedVersion.getHdfsPath()); + + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset mOutput = spark.read().format("parquet").load(mdStoreCleanedVersion.getHdfsPath()+MDSTORE_DATA_PATH).as(encoder); + + final Long total = mOutput.count(); + + final long recordTs = mOutput + .filter((FilterFunction) p -> p.getDateOfTransformation() == 1234) + .count(); + + final long recordNotEmpty = mOutput + .filter((FilterFunction) p -> !StringUtils.isBlank(p.getBody())) + .count(); + + assertEquals(total, recordTs); + + assertEquals(total, recordNotEmpty); } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index d03c3acef..648d7c8a1 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -1,20 +1,12 @@ package eu.dnetlib.dhp.transformation; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.mockito.Mockito.lenient; - -import java.io.IOException; -import java.io.StringWriter; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import javax.xml.transform.stream.StreamSource; - +import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest; +import eu.dnetlib.dhp.aggregation.common.AggregationCounter; +import eu.dnetlib.dhp.collection.CollectionJobTest; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -24,52 +16,42 @@ import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; -import org.dom4j.Document; -import org.dom4j.Node; -import org.dom4j.io.SAXReader; import org.junit.jupiter.api.*; import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.io.TempDir; -import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; -import eu.dnetlib.dhp.aggregation.common.AggregationCounter; -import eu.dnetlib.dhp.collection.CollectionJobTest; -import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Collections; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.MDSTORE_DATA_PATH; +import static org.junit.jupiter.api.Assertions.assertEquals; 
+import static org.mockito.Mockito.lenient; @ExtendWith(MockitoExtension.class) -public class TransformationJobTest { +public class TransformationJobTest extends AbstractVocabularyTest { private static SparkSession spark; - @Mock - private ISLookUpService isLookUpService; - - private VocabularyGroup vocabularies; - - @BeforeEach - public void setUp() throws ISLookUpException, IOException { - lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs()); - - lenient() - .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) - .thenReturn(synonyms()); - vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService); - } - @BeforeAll - public static void beforeAll() { + public static void beforeAll() throws IOException, ISLookUpException { SparkConf conf = new SparkConf(); conf.setAppName(CollectionJobTest.class.getSimpleName()); conf.setMaster("local"); spark = SparkSession.builder().config(conf).getOrCreate(); } + + @BeforeEach + public void setUp() throws IOException, ISLookUpException { + setUpVocabulary(); + } + @AfterAll public static void afterAll() { spark.stop(); @@ -101,8 +83,6 @@ public class TransformationJobTest { mockupTrasformationRule("simpleTRule", "/eu/dnetlib/dhp/transform/ext_simple.xsl"); -// final String arguments = "-issm true -i %s -o %s -d 1 -w 1 -tp XSLT_TRANSFORM -tr simpleTRule"; - final Map parameters = Stream.of(new String[][] { { "dateOfTransformation", "1234" @@ -111,7 +91,7 @@ public class TransformationJobTest { "transformationPlugin", "XSLT_TRANSFORM" }, { - "transformationRuleTitle", "simpleTRule" + "transformationRuleId", "simpleTRule" }, }).collect(Collectors.toMap(data -> data[0], data -> data[1])); @@ -121,7 +101,7 @@ public class TransformationJobTest { // TODO introduce useful assertions final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mOutput = spark.read().format("parquet").load(mdstore_output).as(encoder); + final Dataset mOutput = spark.read().format("parquet").load(mdstore_output+MDSTORE_DATA_PATH).as(encoder); final Long total = mOutput.count(); @@ -151,13 +131,7 @@ public class TransformationJobTest { Files.deleteIfExists(tempDirWithPrefix); } - private void mockupTrasformationRule(final String trule, final String path) throws Exception { - final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path)); - lenient() - .when(isLookUpService.quickSearchProfile(String.format(TransformationFactory.TRULE_XQUERY, trule))) - .thenReturn(Collections.singletonList(trValue)); - } private XSLTTransformationFunction loadTransformationRule(final String path) throws Exception { final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path)); @@ -165,13 +139,5 @@ public class TransformationJobTest { return new XSLTTransformationFunction(new AggregationCounter(la, la, la), trValue, 0, vocabularies); } - private List vocs() throws IOException { - return IOUtils - .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt")); - } - private List synonyms() throws IOException { - return IOUtils - .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt")); - } } From 40764cf626e316f4fba5d999421fdac0ec25c129 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 4 Feb 2021 14:06:02 +0100 Subject: [PATCH 33/86] better logging, WIP: collectorWorker error reporting --- .../dhp/application/ApplicationUtils.java | 6 +- 
.../ArgumentApplicationParser.java | 20 ++--- .../collection/plugin/oai/OaiIterator.java | 12 ++- .../collection/worker/CollectorWorker.java | 6 +- .../worker/CollectorWorkerApplication.java | 5 +- .../aggregation/AbstractVocabularyTest.java | 68 ++++++++-------- .../dhp/aggregation/AggregationJobTest.java | 80 ++++++++++--------- .../transformation/TransformationJobTest.java | 45 ++++++----- 8 files changed, 125 insertions(+), 117 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java index 531c13af3..72c41a062 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java @@ -1,14 +1,12 @@ package eu.dnetlib.dhp.application; -import java.io.File; -import java.io.FileOutputStream; -import java.io.OutputStream; +import java.io.*; import java.util.Properties; public class ApplicationUtils { - public static void populateOOZIEEnv(final String paramName, String value) throws Exception { + public static void populateOOZIEEnv(final String paramName, String value) throws IOException { File file = new File(System.getProperty("oozie.action.output.properties")); Properties props = new Properties(); diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java index e65b4bb0b..0429bc25d 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java @@ -1,10 +1,7 @@ package eu.dnetlib.dhp.application; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.Serializable; -import java.io.StringWriter; +import java.io.*; import java.util.*; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; @@ -12,17 +9,21 @@ import java.util.zip.GZIPOutputStream; import org.apache.commons.cli.*; import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; public class ArgumentApplicationParser implements Serializable { + private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class); + private final Options options = new Options(); private final Map objectMap = new HashMap<>(); private final List compressedValues = new ArrayList<>(); - public ArgumentApplicationParser(final String json_configuration) throws Exception { + public ArgumentApplicationParser(final String json_configuration) throws IOException { final ObjectMapper mapper = new ObjectMapper(); final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class); createOptionMap(configuration); @@ -33,7 +34,6 @@ public class ArgumentApplicationParser implements Serializable { } private void createOptionMap(final OptionsParameter[] configuration) { - Arrays .stream(configuration) .map( @@ -47,10 +47,6 @@ public class ArgumentApplicationParser implements Serializable { return o; }) .forEach(options::addOption); - - // HelpFormatter formatter = new HelpFormatter(); - // formatter.printHelp("myapp", null, options, null, true); - } public static String decompressValue(final String abstractCompressed) { @@ -61,7 +57,7 @@ public class 
ArgumentApplicationParser implements Serializable { IOUtils.copy(gis, stringWriter); return stringWriter.toString(); } catch (Throwable e) { - System.out.println("Wrong value to decompress:" + abstractCompressed); + log.error("Wrong value to decompress:" + abstractCompressed); throw new RuntimeException(e); } } @@ -74,7 +70,7 @@ public class ArgumentApplicationParser implements Serializable { return java.util.Base64.getEncoder().encodeToString(out.toByteArray()); } - public void parseArgument(final String[] args) throws Exception { + public void parseArgument(final String[] args) throws ParseException { CommandLineParser parser = new BasicParser(); CommandLine cmd = parser.parse(options, args); Arrays diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index df0722905..c9cde57ce 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -113,6 +113,7 @@ public class OaiIterator implements Iterator { return downloadPage(url); } catch (final UnsupportedEncodingException e) { + errorLogList.add(e.getMessage()); throw new CollectorException(e); } } @@ -138,6 +139,7 @@ public class OaiIterator implements Iterator { + "?verb=ListRecords&resumptionToken=" + URLEncoder.encode(resumptionToken, "UTF-8")); } catch (final UnsupportedEncodingException e) { + errorLogList.add(e.getMessage()); throw new CollectorException(e); } } @@ -150,12 +152,14 @@ public class OaiIterator implements Iterator { doc = reader.read(new StringReader(xml)); } catch (final DocumentException e) { log.warn("Error parsing xml, I try to clean it. 
{}", e.getMessage()); + errorLogList.add(e.getMessage()); final String cleaned = XmlCleaner.cleanAllEntities(xml); try { doc = reader.read(new StringReader(cleaned)); } catch (final DocumentException e1) { final String resumptionToken = extractResumptionToken(xml); if (resumptionToken == null) { + errorLogList.add(e1.getMessage()); throw new CollectorException("Error parsing cleaned document:\n" + cleaned, e1); } return resumptionToken; @@ -166,10 +170,14 @@ public class OaiIterator implements Iterator { if (errorNode != null) { final String code = errorNode.valueOf("@code"); if ("noRecordsMatch".equalsIgnoreCase(code.trim())) { - log.warn("noRecordsMatch for oai call: " + url); + final String msg = "noRecordsMatch for oai call : " + url; + log.warn(msg); + errorLogList.add(msg); return null; } else { - throw new CollectorException(code + " - " + errorNode.getText()); + final String msg = code + " - " + errorNode.getText(); + errorLogList.add(msg); + throw new CollectorException(msg); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java index 7033cfd8e..f1d3aec9c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java @@ -29,16 +29,13 @@ public class CollectorWorker { private final String hdfsPath; - private CollectorPlugin plugin; - public CollectorWorker( final ApiDescriptor api, final String hdfsuri, - final String hdfsPath) throws CollectorException { + final String hdfsPath) { this.api = api; this.hdfsuri = hdfsuri; this.hdfsPath = hdfsPath; - this.plugin = CollectorPluginFactory.getPluginByProtocol(api.getProtocol()); } public CollectorPluginErrorLogList collect() throws IOException, CollectorException { @@ -59,6 +56,7 @@ public class CollectorWorker { log.info("Created path " + hdfswritepath.toString()); + final CollectorPlugin plugin = CollectorPluginFactory.getPluginByProtocol(api.getProtocol()); final AtomicInteger counter = new AtomicInteger(0); try (SequenceFile.Writer writer = SequenceFile .createWriter( diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java index d89bcee54..7ec830879 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java @@ -5,6 +5,9 @@ import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; import static eu.dnetlib.dhp.application.ApplicationUtils.*; +import java.io.IOException; + +import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,7 +38,7 @@ public class CollectorWorkerApplication { /** * @param args */ - public static void main(final String[] args) throws Exception { + public static void main(final String[] args) throws ParseException, IOException, CollectorException { final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser( IOUtils diff --git 
a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AbstractVocabularyTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AbstractVocabularyTest.java index 84878bd1b..8e0f0ce4b 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AbstractVocabularyTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AbstractVocabularyTest.java @@ -1,52 +1,52 @@ + package eu.dnetlib.dhp.aggregation; +import static org.mockito.Mockito.lenient; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.mockito.Mock; + import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.transformation.TransformationFactory; import eu.dnetlib.dhp.transformation.TransformationJobTest; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import org.apache.commons.io.IOUtils; -import org.mockito.Mock; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; - -import static org.mockito.Mockito.lenient; public abstract class AbstractVocabularyTest { - @Mock - protected ISLookUpService isLookUpService; + @Mock + protected ISLookUpService isLookUpService; - protected VocabularyGroup vocabularies; + protected VocabularyGroup vocabularies; + public void setUpVocabulary() throws ISLookUpException, IOException { + lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs()); + lenient() + .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) + .thenReturn(synonyms()); + vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService); + } - public void setUpVocabulary() throws ISLookUpException, IOException { - lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs()); + private static List vocs() throws IOException { + return IOUtils + .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt")); + } - lenient() - .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) - .thenReturn(synonyms()); - vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService); - } + private static List synonyms() throws IOException { + return IOUtils + .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt")); + } - private static List vocs() throws IOException { - return IOUtils - .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/terms.txt")); - } + protected void mockupTrasformationRule(final String trule, final String path) throws Exception { + final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path)); - private static List synonyms() throws IOException { - return IOUtils - .readLines(TransformationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/transform/synonyms.txt")); - } - - protected void mockupTrasformationRule(final String trule, final String path) throws Exception { - final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path)); - - lenient() - .when(isLookUpService.quickSearchProfile(String.format(TransformationFactory.TRULE_XQUERY, trule))) - .thenReturn(Collections.singletonList(trValue)); - } + lenient() + .when(isLookUpService.quickSearchProfile(String.format(TransformationFactory.TRULE_XQUERY, trule))) + 
.thenReturn(Collections.singletonList(trValue)); + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java index 8f66b6233..3cb66d5ee 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java @@ -1,12 +1,19 @@ package eu.dnetlib.dhp.aggregation; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; -import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.TransformSparkJobNode; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.MDSTORE_DATA_PATH; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; + import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -26,22 +33,17 @@ import org.mockito.junit.jupiter.MockitoExtension; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; -import java.io.FileOutputStream; -import java.io.FileReader; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; +import com.fasterxml.jackson.databind.ObjectMapper; -import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.MDSTORE_DATA_PATH; -import static org.junit.jupiter.api.Assertions.assertEquals; +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.TransformSparkJobNode; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; @TestMethodOrder(MethodOrderer.OrderAnnotation.class) @ExtendWith(MockitoExtension.class) -public class AggregationJobTest extends AbstractVocabularyTest{ +public class AggregationJobTest extends AbstractVocabularyTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -58,8 +60,6 @@ public class AggregationJobTest extends AbstractVocabularyTest{ private static final Logger log = LoggerFactory.getLogger(AggregationJobTest.class); - - @BeforeAll public static void beforeAll() throws IOException { provenance = IOUtils @@ -86,8 +86,6 @@ public class AggregationJobTest extends AbstractVocabularyTest{ .getOrCreate(); } - - @AfterAll public static void afterAll() throws IOException { FileUtils.deleteDirectory(workingDir.toFile()); @@ -161,36 +159,42 @@ public class AggregationJobTest extends AbstractVocabularyTest{ MDStoreVersion mdStoreV2 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_2.json"); MDStoreVersion mdStoreCleanedVersion = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json"); - mockupTrasformationRule("simpleTRule", "/eu/dnetlib/dhp/transform/ext_simple.xsl"); final Map parameters = Stream.of(new String[][] { - { - 
"dateOfTransformation", "1234" - }, - { - "transformationPlugin", "XSLT_TRANSFORM" - }, - { - "transformationRuleId", "simpleTRule" - }, + { + "dateOfTransformation", "1234" + }, + { + "transformationPlugin", "XSLT_TRANSFORM" + }, + { + "transformationRuleId", "simpleTRule" + }, }).collect(Collectors.toMap(data -> data[0], data -> data[1])); - TransformSparkJobNode.transformRecords(parameters, isLookUpService, spark, mdStoreV2.getHdfsPath()+MDSTORE_DATA_PATH, mdStoreCleanedVersion.getHdfsPath()); + TransformSparkJobNode + .transformRecords( + parameters, isLookUpService, spark, mdStoreV2.getHdfsPath() + MDSTORE_DATA_PATH, + mdStoreCleanedVersion.getHdfsPath()); final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mOutput = spark.read().format("parquet").load(mdStoreCleanedVersion.getHdfsPath()+MDSTORE_DATA_PATH).as(encoder); + final Dataset mOutput = spark + .read() + .format("parquet") + .load(mdStoreCleanedVersion.getHdfsPath() + MDSTORE_DATA_PATH) + .as(encoder); final Long total = mOutput.count(); final long recordTs = mOutput - .filter((FilterFunction) p -> p.getDateOfTransformation() == 1234) - .count(); + .filter((FilterFunction) p -> p.getDateOfTransformation() == 1234) + .count(); final long recordNotEmpty = mOutput - .filter((FilterFunction) p -> !StringUtils.isBlank(p.getBody())) - .count(); + .filter((FilterFunction) p -> !StringUtils.isBlank(p.getBody())) + .count(); assertEquals(total, recordTs); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 648d7c8a1..9d6dacf0c 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -1,12 +1,18 @@ package eu.dnetlib.dhp.transformation; -import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest; -import eu.dnetlib.dhp.aggregation.common.AggregationCounter; -import eu.dnetlib.dhp.collection.CollectionJobTest; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.MDSTORE_DATA_PATH; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.lenient; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Collections; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -21,17 +27,12 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.io.TempDir; import org.mockito.junit.jupiter.MockitoExtension; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Collections; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.MDSTORE_DATA_PATH; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.mockito.Mockito.lenient; +import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest; +import 
eu.dnetlib.dhp.aggregation.common.AggregationCounter; +import eu.dnetlib.dhp.collection.CollectionJobTest; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; @ExtendWith(MockitoExtension.class) public class TransformationJobTest extends AbstractVocabularyTest { @@ -46,7 +47,6 @@ public class TransformationJobTest extends AbstractVocabularyTest { spark = SparkSession.builder().config(conf).getOrCreate(); } - @BeforeEach public void setUp() throws IOException, ISLookUpException { setUpVocabulary(); @@ -101,7 +101,11 @@ public class TransformationJobTest extends AbstractVocabularyTest { // TODO introduce useful assertions final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mOutput = spark.read().format("parquet").load(mdstore_output+MDSTORE_DATA_PATH).as(encoder); + final Dataset mOutput = spark + .read() + .format("parquet") + .load(mdstore_output + MDSTORE_DATA_PATH) + .as(encoder); final Long total = mOutput.count(); @@ -131,13 +135,10 @@ public class TransformationJobTest extends AbstractVocabularyTest { Files.deleteIfExists(tempDirWithPrefix); } - - private XSLTTransformationFunction loadTransformationRule(final String path) throws Exception { final String trValue = IOUtils.toString(this.getClass().getResourceAsStream(path)); final LongAccumulator la = new LongAccumulator(); return new XSLTTransformationFunction(new AggregationCounter(la, la, la), trValue, 0, vocabularies); } - } From 72c57b28fa3ce38524a0caf830cbfa82ae50bd39 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 4 Feb 2021 14:08:18 +0100 Subject: [PATCH 34/86] switched project version to 1.2.4-branch_hadoop_aggregator-SNAPSHOT --- dhp-build/dhp-build-assembly-resources/pom.xml | 2 +- dhp-build/dhp-build-properties-maven-plugin/pom.xml | 2 +- dhp-build/dhp-code-style/pom.xml | 2 +- dhp-build/pom.xml | 2 +- dhp-common/pom.xml | 2 +- dhp-schemas/pom.xml | 2 +- dhp-workflows/dhp-actionmanager/pom.xml | 2 +- dhp-workflows/dhp-aggregation/pom.xml | 2 +- dhp-workflows/dhp-blacklist/pom.xml | 2 +- dhp-workflows/dhp-broker-events/pom.xml | 2 +- dhp-workflows/dhp-dedup-openaire/pom.xml | 2 +- dhp-workflows/dhp-dedup-scholexplorer/pom.xml | 2 +- dhp-workflows/dhp-distcp/pom.xml | 2 +- dhp-workflows/dhp-doiboost/pom.xml | 2 +- dhp-workflows/dhp-enrichment/pom.xml | 2 +- dhp-workflows/dhp-graph-mapper/pom.xml | 2 +- dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml | 2 +- dhp-workflows/dhp-graph-provision/pom.xml | 2 +- dhp-workflows/dhp-stats-promote/pom.xml | 2 +- dhp-workflows/dhp-stats-update/pom.xml | 2 +- dhp-workflows/dhp-usage-raw-data-update/pom.xml | 2 +- dhp-workflows/dhp-usage-stats-build/pom.xml | 2 +- dhp-workflows/dhp-workflow-profiles/pom.xml | 2 +- dhp-workflows/pom.xml | 2 +- pom.xml | 2 +- 25 files changed, 25 insertions(+), 25 deletions(-) diff --git a/dhp-build/dhp-build-assembly-resources/pom.xml b/dhp-build/dhp-build-assembly-resources/pom.xml index 012ff89a3..6298a2e9c 100644 --- a/dhp-build/dhp-build-assembly-resources/pom.xml +++ b/dhp-build/dhp-build-assembly-resources/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT dhp-build-assembly-resources diff --git a/dhp-build/dhp-build-properties-maven-plugin/pom.xml b/dhp-build/dhp-build-properties-maven-plugin/pom.xml index 256017e2c..882d668c9 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml +++ 
b/dhp-build/dhp-build-properties-maven-plugin/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT dhp-build-properties-maven-plugin diff --git a/dhp-build/dhp-code-style/pom.xml b/dhp-build/dhp-code-style/pom.xml index 77aa2aedb..21170ff13 100644 --- a/dhp-build/dhp-code-style/pom.xml +++ b/dhp-build/dhp-code-style/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp-code-style - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT jar diff --git a/dhp-build/pom.xml b/dhp-build/pom.xml index 12b999b9c..8ee3308cd 100644 --- a/dhp-build/pom.xml +++ b/dhp-build/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT dhp-build pom diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index a8607a9b3..e2db8b451 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT ../ diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index 73efeabb4..10ee5f9ff 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT ../pom.xml diff --git a/dhp-workflows/dhp-actionmanager/pom.xml b/dhp-workflows/dhp-actionmanager/pom.xml index 0b4d25700..55eece63d 100644 --- a/dhp-workflows/dhp-actionmanager/pom.xml +++ b/dhp-workflows/dhp-actionmanager/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp-workflows - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT dhp-actionmanager diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index b61c3d443..f0ee42542 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp-workflows - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT dhp-aggregation diff --git a/dhp-workflows/dhp-blacklist/pom.xml b/dhp-workflows/dhp-blacklist/pom.xml index 9c25f7b29..6312e971a 100644 --- a/dhp-workflows/dhp-blacklist/pom.xml +++ b/dhp-workflows/dhp-blacklist/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index 75cc0ea09..29e7c5295 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -5,7 +5,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-dedup-openaire/pom.xml b/dhp-workflows/dhp-dedup-openaire/pom.xml index 03ddbcf4c..39a3c50fc 100644 --- a/dhp-workflows/dhp-dedup-openaire/pom.xml +++ b/dhp-workflows/dhp-dedup-openaire/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 dhp-dedup-openaire diff --git a/dhp-workflows/dhp-dedup-scholexplorer/pom.xml b/dhp-workflows/dhp-dedup-scholexplorer/pom.xml index aa4070b01..6c85a459d 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/pom.xml +++ b/dhp-workflows/dhp-dedup-scholexplorer/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-distcp/pom.xml b/dhp-workflows/dhp-distcp/pom.xml index 8c10538c0..507cebca5 100644 --- a/dhp-workflows/dhp-distcp/pom.xml +++ b/dhp-workflows/dhp-distcp/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 diff 
--git a/dhp-workflows/dhp-doiboost/pom.xml b/dhp-workflows/dhp-doiboost/pom.xml index 624dd7b31..b2c6fab53 100644 --- a/dhp-workflows/dhp-doiboost/pom.xml +++ b/dhp-workflows/dhp-doiboost/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-enrichment/pom.xml b/dhp-workflows/dhp-enrichment/pom.xml index d0ab77cc5..47103156c 100644 --- a/dhp-workflows/dhp-enrichment/pom.xml +++ b/dhp-workflows/dhp-enrichment/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index 3e1d84c01..5e8448182 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml index b287e9c88..044850806 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index 0d44d8e5e..db473bc0b 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-stats-promote/pom.xml b/dhp-workflows/dhp-stats-promote/pom.xml index c64c2f58e..f22c19047 100644 --- a/dhp-workflows/dhp-stats-promote/pom.xml +++ b/dhp-workflows/dhp-stats-promote/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 dhp-stats-promote diff --git a/dhp-workflows/dhp-stats-update/pom.xml b/dhp-workflows/dhp-stats-update/pom.xml index 52f35ff07..0fda05325 100644 --- a/dhp-workflows/dhp-stats-update/pom.xml +++ b/dhp-workflows/dhp-stats-update/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 dhp-stats-update diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml index a78f92d41..3d01ad847 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/pom.xml +++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 dhp-usage-raw-data-update diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml index 20d2f5b76..bf580ed7f 100644 --- a/dhp-workflows/dhp-usage-stats-build/pom.xml +++ b/dhp-workflows/dhp-usage-stats-build/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 dhp-usage-stats-build diff --git a/dhp-workflows/dhp-workflow-profiles/pom.xml b/dhp-workflows/dhp-workflow-profiles/pom.xml index 54e76c1e2..99a793612 100644 --- a/dhp-workflows/dhp-workflow-profiles/pom.xml +++ b/dhp-workflows/dhp-workflow-profiles/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 
190c9847e..4131d79c0 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index cfe1edfbd..bef649c67 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 eu.dnetlib.dhp dhp - 1.2.4-SNAPSHOT + 1.2.4-branch_hadoop_aggregator-SNAPSHOT pom From 4dae5e605dc48cf7ef5dd57b562d1f7b5938b329 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 4 Feb 2021 15:51:15 +0100 Subject: [PATCH 35/86] implemented messaging between collection worker and Dnet --- .../java/eu/dnetlib/dhp/message/Message.java | 22 ++--- .../eu/dnetlib/dhp/message/MessageSender.java | 87 ++++++++++++------- .../collection/worker/CollectorWorker.java | 13 ++- .../worker/CollectorWorkerApplication.java | 14 ++- .../dhp/collection/collector_parameter.json | 10 ++- .../dhp/collection/oozie_app/workflow.xml | 8 ++ .../DnetCollectorWorkerApplicationTests.java | 11 --- 7 files changed, 105 insertions(+), 60 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java index 57844d490..978af6dd8 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java @@ -2,10 +2,15 @@ package eu.dnetlib.dhp.message; import java.io.Serializable; +import java.util.HashMap; import java.util.Map; public class Message implements Serializable { + public static String CURRENT_PARAM = "current"; + public static String TOTAL_PARAM = "total"; + + /** * */ @@ -13,17 +18,14 @@ public class Message implements Serializable { private String workflowId; - private String jobName; - private Map body; public Message() { + body = new HashMap<>(); } - public Message(final String workflowId, final String jobName, - final Map body) { + public Message(final String workflowId, final Map body) { this.workflowId = workflowId; - this.jobName = jobName; this.body = body; } @@ -35,14 +37,6 @@ public class Message implements Serializable { this.workflowId = workflowId; } - public String getJobName() { - return jobName; - } - - public void setJobName(final String jobName) { - this.jobName = jobName; - } - public Map getBody() { return body; } @@ -53,6 +47,6 @@ public class Message implements Serializable { @Override public String toString() { - return String.format("Message [workflowId=%s, jobName=%s, body=%s]", workflowId, jobName, body); + return String.format("Message [workflowId=%s, body=%s]", workflowId, body); } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java index 70eb594f8..35ecaa50c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java @@ -10,48 +10,71 @@ import org.apache.http.impl.client.HttpClients; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; + public class MessageSender { - private static final Logger log = LoggerFactory.getLogger(MessageSender.class); + private static final Logger log = LoggerFactory.getLogger(MessageSender.class); - private static final int SOCKET_TIMEOUT_MS = 2000; + private static final int SOCKET_TIMEOUT_MS = 2000; - private static final int CONNECTION_REQUEST_TIMEOUT_MS = 2000; + private static final int CONNECTION_REQUEST_TIMEOUT_MS
= 2000; - private static final int CONNTECTION_TIMEOUT_MS = 2000; + private static final int CONNTECTION_TIMEOUT_MS = 2000; - private final String dnetMessageEndpoint; + private final String dnetMessageEndpoint; - public MessageSender(final String dnetMessageEndpoint) { - this.dnetMessageEndpoint = dnetMessageEndpoint; - } + private final String workflowId; - public void sendMessage(final Message message) { - new Thread(() -> _sendMessage(message)).start(); - } - private void _sendMessage(final Message message) { - final HttpPut req = new HttpPut(dnetMessageEndpoint); - req.setEntity(new SerializableEntity(message)); + public MessageSender(final String dnetMessageEndpoint, final String workflowId) { + this.workflowId = workflowId; + this.dnetMessageEndpoint = dnetMessageEndpoint; + } - final RequestConfig requestConfig = RequestConfig - .custom() - .setConnectTimeout(CONNTECTION_TIMEOUT_MS) - .setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS) - .setSocketTimeout(SOCKET_TIMEOUT_MS) - .build(); - ; + public void sendMessage(final Message message) { + new Thread(() -> _sendMessage(message)).start(); + } - try (final CloseableHttpClient client = HttpClients - .custom() - .setDefaultRequestConfig(requestConfig) - .build(); - final CloseableHttpResponse response = client.execute(req)) { - log.debug("Sent Message to " + dnetMessageEndpoint); - log.debug("MESSAGE:" + message); - } catch (final Throwable e) { - log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e); - } - } + public void sendMessage(final Long current, final Long total) { + sendMessage(createMessage(current, total)); + } + + + private Message createMessage(final Long current, final Long total) { + + final Message m = new Message(); + m.setWorkflowId(workflowId); + m.getBody().put(Message.CURRENT_PARAM, current.toString()); + if (total != null) + m.getBody().put(Message.TOTAL_PARAM, total.toString()); + return m; + } + + + private void _sendMessage(final Message message) { + final HttpPut req = new HttpPut(dnetMessageEndpoint); + req.setEntity(new SerializableEntity(message)); + + final RequestConfig requestConfig = RequestConfig + .custom() + .setConnectTimeout(CONNTECTION_TIMEOUT_MS) + .setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS) + .setSocketTimeout(SOCKET_TIMEOUT_MS) + .build(); + ; + + try (final CloseableHttpClient client = HttpClients + .custom() + .setDefaultRequestConfig(requestConfig) + .build(); + final CloseableHttpResponse response = client.execute(req)) { + log.debug("Sent Message to " + dnetMessageEndpoint); + log.debug("MESSAGE:" + message); + } catch (final Throwable e) { + log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e); + } + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java index f1d3aec9c..9d82a1ed4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java @@ -5,6 +5,8 @@ import java.io.IOException; import java.net.URI; import java.util.concurrent.atomic.AtomicInteger; +import eu.dnetlib.dhp.message.Message; +import eu.dnetlib.dhp.message.MessageSender; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -29,13 
+31,18 @@ public class CollectorWorker { private final String hdfsPath; + private final MessageSender messageSender; + + public CollectorWorker( final ApiDescriptor api, final String hdfsuri, - final String hdfsPath) { + final String hdfsPath, + final MessageSender messageSender) { this.api = api; this.hdfsuri = hdfsuri; this.hdfsPath = hdfsPath; + this.messageSender = messageSender; } public CollectorPluginErrorLogList collect() throws IOException, CollectorException { @@ -58,6 +65,7 @@ public class CollectorWorker { final CollectorPlugin plugin = CollectorPluginFactory.getPluginByProtocol(api.getProtocol()); final AtomicInteger counter = new AtomicInteger(0); + try (SequenceFile.Writer writer = SequenceFile .createWriter( conf, @@ -71,6 +79,8 @@ public class CollectorWorker { .forEach( content -> { key.set(counter.getAndIncrement()); + if (counter.get()% 500 == 0) + messageSender.sendMessage(counter.longValue(), null); value.set(content); try { writer.append(key, value); @@ -79,6 +89,7 @@ public class CollectorWorker { } }); } finally { + messageSender.sendMessage(counter.longValue(),counter.longValue()); return plugin.getCollectionErrors(); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java index 7ec830879..d8e8a8e49 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java @@ -7,6 +7,8 @@ import static eu.dnetlib.dhp.application.ApplicationUtils.*; import java.io.IOException; +import eu.dnetlib.dhp.message.Message; +import eu.dnetlib.dhp.message.MessageSender; import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; @@ -57,17 +59,27 @@ public class CollectorWorkerApplication { final String mdStoreVersion = argumentParser.get("mdStoreVersion"); log.info("mdStoreVersion is {}", mdStoreVersion); + final String dnetMessageManagerURL = argumentParser.get("dnetMessageManagerURL"); + log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL); + + final String workflowId = argumentParser.get("workflowId"); + log.info("workflowId is {}", workflowId); + + final MessageSender ms = new MessageSender(dnetMessageManagerURL,workflowId); + final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class); final String hdfsPath = currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME; log.info("hdfs path is {}", hdfsPath); final ApiDescriptor api = MAPPER.readValue(apiDescriptor, ApiDescriptor.class); - final CollectorWorker worker = new CollectorWorker(api, hdfsuri, hdfsPath); + final CollectorWorker worker = new CollectorWorker(api, hdfsuri, hdfsPath, ms); CollectorPluginErrorLogList errors = worker.collect(); populateOOZIEEnv(COLLECTOR_WORKER_ERRORS, errors.toString()); } + + } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json index 60e9762ff..6ccba468a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json @@ -17,10 +17,18 @@ 
"paramDescription": "the MDStore Version bean", "paramRequired": true }, + { + "paramName": "dm", + "paramLongName": "dnetMessageManagerURL", + "paramDescription": "the End point URL to send Messages", + "paramRequired": true + }, + + { "paramName": "w", "paramLongName": "workflowId", "paramDescription": "the identifier of the dnet Workflow", - "paramRequired": false + "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index 595613a2e..b74ef6b61 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -32,6 +32,11 @@ mdStoreManagerURI The URI of the MDStore Manager + + + dnetMessageManagerURL + The URI of the Dnet Message Manager + collectionMode Should be REFRESH or INCREMENTAL @@ -86,7 +91,10 @@ eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication --apidescriptor${apiDescription} --namenode${nameNode} + --workflowId${workflowId} + --dnetMessageManagerURL${dnetMessageManagerURL} --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java index 10964096c..9066e32b6 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java @@ -36,17 +36,6 @@ public class DnetCollectorWorkerApplicationTests { assertNotNull(mapper.writeValueAsString(api)); } - @Test - public void testFeeding(@TempDir Path testDir) throws Exception { - - System.out.println(testDir.toString()); - CollectorWorker worker = new CollectorWorker(getApi(), - "file://" + testDir.toString() + "/file.seq", testDir.toString() + "/file.seq"); - worker.collect(); - - // TODO create ASSERT HERE - } - private ApiDescriptor getApi() { final ApiDescriptor api = new ApiDescriptor(); api.setId("oai"); From deb85706db53e4cd304ae323805e518d06570be2 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 4 Feb 2021 17:24:52 +0100 Subject: [PATCH 36/86] imported HttpConnector from https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java as HttpConnector2 --- .../dhp/model/mdstore/MetadataRecordTest.java | 16 - .../actionmanager/project/utils/ReadCSV.java | 4 +- .../project/utils/ReadExcel.java | 5 +- .../collection/plugin/oai/OaiIterator.java | 8 +- .../plugin/oai/OaiIteratorFactory.java | 8 +- .../worker/utils/HttpConnector.java | 8 +- .../worker/utils/HttpConnector2.java | 298 ++++++++++++++++++ .../project/EXCELParserTest.java | 4 +- .../httpconnector/HttpConnectorTest.java | 6 +- ...a => CollectorWorkerApplicationTests.java} | 2 +- 10 files changed, 316 insertions(+), 43 deletions(-) delete mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector2.java rename 
dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/{DnetCollectorWorkerApplicationTests.java => CollectorWorkerApplicationTests.java} (97%) diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java deleted file mode 100644 index cb4d0ab50..000000000 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java +++ /dev/null @@ -1,16 +0,0 @@ - -package eu.dnetlib.dhp.model.mdstore; - -import static org.junit.jupiter.api.Assertions.assertTrue; - -import org.junit.jupiter.api.Test; - -public class MetadataRecordTest { - - @Test - public void getTimestamp() { - - MetadataRecord r = new MetadataRecord(); - assertTrue(r.getDateOfCollection() > 0); - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java index ca1c10611..1b9e070fe 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java @@ -18,7 +18,7 @@ import org.apache.hadoop.fs.Path; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; /** * Applies the parsing of a csv file and writes the Serialization of it in hdfs @@ -74,7 +74,7 @@ public class ReadCSV implements Closeable { throws Exception { this.conf = new Configuration(); this.conf.set("fs.defaultFS", hdfsNameNode); - HttpConnector httpConnector = new HttpConnector(); + HttpConnector2 httpConnector = new HttpConnector2(); FileSystem fileSystem = FileSystem.get(this.conf); Path hdfsWritePath = new Path(hdfsPath); FSDataOutputStream fsDataOutputStream = null; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java index 585a408f3..2ad3f5b34 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java @@ -15,12 +15,11 @@ import org.apache.hadoop.fs.Path; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; /** * Applies the parsing of an excel file and writes the Serialization of it in hdfs */ - public class ReadExcel implements Closeable { private static final Log log = LogFactory.getLog(ReadCSV.class); private final Configuration conf; @@ -72,7 +71,7 @@ public class ReadExcel implements Closeable { throws Exception { this.conf = new Configuration(); this.conf.set("fs.defaultFS", hdfsNameNode); - HttpConnector httpConnector = new HttpConnector(); + HttpConnector2 httpConnector = new HttpConnector2(); FileSystem fileSystem = FileSystem.get(this.conf); Path hdfsWritePath = new Path(hdfsPath); FSDataOutputStream fsDataOutputStream = null; diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index c9cde57ce..9c1ff0663 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -9,8 +9,6 @@ import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; import org.apache.commons.lang.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Node; @@ -20,7 +18,7 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.collection.worker.CollectorException; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner; public class OaiIterator implements Iterator { @@ -37,7 +35,7 @@ public class OaiIterator implements Iterator { private final String untilDate; private String token; private boolean started; - private final HttpConnector httpConnector; + private final HttpConnector2 httpConnector; private CollectorPluginErrorLogList errorLogList; public OaiIterator( @@ -46,7 +44,7 @@ public class OaiIterator implements Iterator { final String set, final String fromDate, final String untilDate, - final HttpConnector httpConnector, + final HttpConnector2 httpConnector, final CollectorPluginErrorLogList errorLogList) { this.baseUrl = baseUrl; this.mdFormat = mdFormat; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java index eafd265d4..d6759580f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java @@ -4,11 +4,11 @@ package eu.dnetlib.dhp.collection.plugin.oai; import java.util.Iterator; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; public class OaiIteratorFactory { - private HttpConnector httpConnector; + private HttpConnector2 httpConnector; public Iterator newIterator( final String baseUrl, @@ -20,9 +20,9 @@ public class OaiIteratorFactory { return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(), errorLogList); } - private HttpConnector getHttpConnector() { + private HttpConnector2 getHttpConnector() { if (httpConnector == null) - httpConnector = new HttpConnector(); + httpConnector = new HttpConnector2(); return httpConnector; } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java index fc45b4814..39c2371b9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java @@ -4,16 +4,9 @@ package eu.dnetlib.dhp.collection.worker.utils; import java.io.IOException; import java.io.InputStream; import java.net.*; -import java.security.GeneralSecurityException; -import java.security.cert.X509Certificate; import java.util.List; import java.util.Map; -import javax.net.ssl.HttpsURLConnection; -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; - import org.apache.commons.io.IOUtils; import org.apache.commons.lang.math.NumberUtils; import org.slf4j.Logger; @@ -21,6 +14,7 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.collection.worker.CollectorException; +@Deprecated public class HttpConnector { private static final Logger log = LoggerFactory.getLogger(HttpConnector.class); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector2.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector2.java new file mode 100644 index 000000000..b316f34ed --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector2.java @@ -0,0 +1,298 @@ + +package eu.dnetlib.dhp.collection.worker.utils; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.*; +import java.security.GeneralSecurityException; +import java.security.cert.X509Certificate; +import java.util.List; +import java.util.Map; + +import javax.net.ssl.HttpsURLConnection; +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.math.NumberUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import eu.dnetlib.dhp.collection.worker.CollectorException; + +/** + * Migrated from https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java + * + * @author jochen, michele, andrea, alessia + */ +public class HttpConnector2 { + + private static final Log log = LogFactory.getLog(HttpConnector.class); + + private int maxNumberOfRetry = 6; + private int defaultDelay = 120; // seconds + private int readTimeOut = 120; // seconds + + private String responseType = null; + + private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; + + public HttpConnector2() { + CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); + } + + /** + * @see HttpConnector2#getInputSource(java.lang.String, eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList) + */ + public String getInputSource(final String requestUrl) throws CollectorException { + return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); + } + + /** + * @see HttpConnector2#getInputSource(java.lang.String, eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList) + */ + public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException { + return IOUtils.toInputStream(getInputSource(requestUrl)); + } + + /** + * Given the URL returns the content via HTTP GET + * + * @param requestUrl the URL + * @param errorLogList the list of errors + * @return the content of the downloaded resource + * @throws CollectorException when 
retrying more than maxNumberOfRetry times + */ + public String getInputSource(final String requestUrl, CollectorPluginErrorLogList errorLogList) + throws CollectorException { + return attemptDownlaodAsString(requestUrl, 1, errorLogList); + } + + private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, + final CollectorPluginErrorLogList errorList) + throws CollectorException { + try { + InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); + try { + return IOUtils.toString(s); + } catch (IOException e) { + log.error("error while retrieving from http-connection occured: " + requestUrl, e); + Thread.sleep(defaultDelay * 1000); + errorList.add(e.getMessage()); + return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); + } finally { + IOUtils.closeQuietly(s); + } + } catch (InterruptedException e) { + throw new CollectorException(e); + } + } + + private InputStream attemptDownload(final String requestUrl, final int retryNumber, + final CollectorPluginErrorLogList errorList) + throws CollectorException { + + if (retryNumber > maxNumberOfRetry) { + throw new CollectorException("Max number of retries exceeded. Cause: \n " + errorList); + } + + log.debug("Downloading " + requestUrl + " - try: " + retryNumber); + try { + InputStream input = null; + + try { + final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); + urlConn.setInstanceFollowRedirects(false); + urlConn.setReadTimeout(readTimeOut * 1000); + urlConn.addRequestProperty("User-Agent", userAgent); + + if (log.isDebugEnabled()) { + logHeaderFields(urlConn); + } + + int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); + if (is2xx(urlConn.getResponseCode())) { + input = urlConn.getInputStream(); + responseType = urlConn.getContentType(); + return input; + } + if (is3xx(urlConn.getResponseCode())) { + // REDIRECTS + final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); + log.debug(String.format("The requested url %s has been moved to %s", requestUrl, newUrl)); + errorList + .add( + String + .format( + "%s %s %s. Moved to: %s", requestUrl, urlConn.getResponseCode(), + urlConn.getResponseMessage(), newUrl)); + urlConn.disconnect(); + if (retryAfter > 0) + Thread.sleep(retryAfter * 1000); + return attemptDownload(newUrl, retryNumber + 1, errorList); + } + if (is4xx(urlConn.getResponseCode())) { + // CLIENT ERROR, DO NOT RETRY + errorList + .add( + String + .format( + "%s error %s: %s", requestUrl, urlConn.getResponseCode(), + urlConn.getResponseMessage())); + throw new CollectorException("4xx error: request will not be repeated. 
" + errorList); + } + if (is5xx(urlConn.getResponseCode())) { + // SERVER SIDE ERRORS RETRY ONLY on 503 + switch (urlConn.getResponseCode()) { + case HttpURLConnection.HTTP_UNAVAILABLE: + if (retryAfter > 0) { + log + .warn( + requestUrl + " - waiting and repeating request after suggested retry-after " + + retryAfter + " sec."); + Thread.sleep(retryAfter * 1000); + } else { + log + .warn( + requestUrl + " - waiting and repeating request after default delay of " + + defaultDelay + " sec."); + Thread.sleep(defaultDelay * 1000); + } + errorList.add(requestUrl + " 503 Service Unavailable"); + urlConn.disconnect(); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + default: + errorList + .add( + String + .format( + "%s Error %s: %s", requestUrl, urlConn.getResponseCode(), + urlConn.getResponseMessage())); + throw new CollectorException(urlConn.getResponseCode() + " error " + errorList); + } + } + throw new CollectorException( + String.format("Unexpected status code: %s error %s", urlConn.getResponseCode(), errorList)); + } catch (MalformedURLException | NoRouteToHostException e) { + errorList.add(String.format("Error: %s for request url: %s", e.getCause(), requestUrl)); + throw new CollectorException(e + "error " + errorList); + } catch (IOException e) { + Thread.sleep(defaultDelay * 1000); + errorList.add(requestUrl + " " + e.getMessage()); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + } + } catch (InterruptedException e) { + throw new CollectorException(e); + } + } + + private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { + log.debug("StatusCode: " + urlConn.getResponseMessage()); + + for (Map.Entry> e : urlConn.getHeaderFields().entrySet()) { + if (e.getKey() != null) { + for (String v : e.getValue()) { + log.debug(" key: " + e.getKey() + " - value: " + v); + } + } + } + } + + private int obtainRetryAfter(final Map> headerMap) { + for (String key : headerMap.keySet()) { + if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) + && NumberUtils.isCreatable(headerMap.get(key).get(0))) { + return Integer + .parseInt(headerMap.get(key).get(0)) + 10; + } + } + return -1; + } + + private String obtainNewLocation(final Map> headerMap) throws CollectorException { + for (String key : headerMap.keySet()) { + if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { + return headerMap.get(key).get(0); + } + } + throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING"); + } + + /** + * register for https scheme; this is a workaround and not intended for the use in trusted environments + */ + public void initTrustManager() { + final X509TrustManager tm = new X509TrustManager() { + + @Override + public void checkClientTrusted(final X509Certificate[] xcs, final String string) { + } + + @Override + public void checkServerTrusted(final X509Certificate[] xcs, final String string) { + } + + @Override + public X509Certificate[] getAcceptedIssuers() { + return null; + } + }; + try { + final SSLContext ctx = SSLContext.getInstance("TLS"); + ctx.init(null, new TrustManager[] { + tm + }, null); + HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); + } catch (GeneralSecurityException e) { + log.fatal(e); + throw new IllegalStateException(e); + } + } + + private boolean is2xx(final int statusCode) { + return statusCode >= 200 && statusCode <= 299; + } + + private boolean is4xx(final int statusCode) { + return 
statusCode >= 400 && statusCode <= 499; + } + + private boolean is3xx(final int statusCode) { + return statusCode >= 300 && statusCode <= 399; + } + + private boolean is5xx(final int statusCode) { + return statusCode >= 500 && statusCode <= 599; + } + + public int getMaxNumberOfRetry() { + return maxNumberOfRetry; + } + + public void setMaxNumberOfRetry(final int maxNumberOfRetry) { + this.maxNumberOfRetry = maxNumberOfRetry; + } + + public int getDefaultDelay() { + return defaultDelay; + } + + public void setDefaultDelay(final int defaultDelay) { + this.defaultDelay = defaultDelay; + } + + public int getReadTimeOut() { + return readTimeOut; + } + + public void setReadTimeOut(final int readTimeOut) { + this.readTimeOut = readTimeOut; + } + + public String getResponseType() { + return responseType; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java index 5c37e9ec3..e6fda9be0 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java @@ -14,13 +14,13 @@ import org.junit.jupiter.api.Test; import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser; import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; @Disabled public class EXCELParserTest { private static Path workingDir; - private HttpConnector httpConnector = new HttpConnector(); + private HttpConnector2 httpConnector = new HttpConnector2(); private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx"; @BeforeAll diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java index f5ef280a0..103e11c33 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java @@ -10,13 +10,13 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; @Disabled public class HttpConnectorTest { private static final Log log = LogFactory.getLog(HttpConnectorTest.class); - private static HttpConnector connector; + private static HttpConnector2 connector; private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx"; private static final String URL_MISCONFIGURED_SERVER = "https://www.alexandria.unisg.ch/cgi/oai2?verb=Identify"; @@ -27,7 +27,7 @@ public class HttpConnectorTest { @BeforeAll public static void setUp() { - connector = new HttpConnector(); + connector = new HttpConnector2(); } @Test diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java 
similarity index 97% rename from dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java rename to dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java index 10964096c..ff9ad8c85 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java @@ -16,7 +16,7 @@ import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; @Disabled -public class DnetCollectorWorkerApplicationTests { +public class CollectorWorkerApplicationTests { @Test public void testFindPlugin() throws Exception { From 2ee0c3e47e4b17a7902de5b4f90e215c83bdd763 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Fri, 5 Feb 2021 09:45:39 +0100 Subject: [PATCH 37/86] http entity as json string --- .../java/eu/dnetlib/dhp/message/Message.java | 1 - .../eu/dnetlib/dhp/message/MessageSender.java | 105 ++++++++++-------- 2 files changed, 56 insertions(+), 50 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java index 978af6dd8..ed2a3c9b3 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java @@ -10,7 +10,6 @@ public class Message implements Serializable { public static String CURRENT_PARAM = "current"; public static String TOTAL_PARAM = "total"; - /** * */ diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java index 35ecaa50c..3f9d07a7e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java @@ -4,77 +4,84 @@ package eu.dnetlib.dhp.message; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPut; -import org.apache.http.entity.SerializableEntity; +import org.apache.http.entity.ContentType; +import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Function; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; public class MessageSender { - private static final Logger log = LoggerFactory.getLogger(MessageSender.class); + private static final Logger log = LoggerFactory.getLogger(MessageSender.class); - private static final int SOCKET_TIMEOUT_MS = 2000; + private static final int SOCKET_TIMEOUT_MS = 2000; - private static final int CONNECTION_REQUEST_TIMEOUT_MS = 2000; + private static final int CONNECTION_REQUEST_TIMEOUT_MS = 2000; - private static final int CONNTECTION_TIMEOUT_MS = 2000; + private static final int CONNTECTION_TIMEOUT_MS = 2000; - private final String dnetMessageEndpoint; + private final ObjectMapper objectMapper = new ObjectMapper(); - private final String workflowId; + private final String dnetMessageEndpoint; + private final String workflowId; - public MessageSender(final String 
dnetMessageEndpoint, final String workflowId) { - this.workflowId = workflowId; - this.dnetMessageEndpoint = dnetMessageEndpoint; - } + public MessageSender(final String dnetMessageEndpoint, final String workflowId) { + this.workflowId = workflowId; + this.dnetMessageEndpoint = dnetMessageEndpoint; + } - public void sendMessage(final Message message) { - new Thread(() -> _sendMessage(message)).start(); - } + public void sendMessage(final Message message) { + new Thread(() -> _sendMessage(message)).start(); + } - public void sendMessage(final Long current, final Long total) { - sendMessage(createMessage(current, total)); - } + public void sendMessage(final Long current, final Long total) { + sendMessage(createMessage(current, total)); + } + private Message createMessage(final Long current, final Long total) { - private Message createMessage(final Long current, final Long total) { + final Message m = new Message(); + m.setWorkflowId(workflowId); + m.getBody().put(Message.CURRENT_PARAM, current.toString()); + if (total != null) { + m.getBody().put(Message.TOTAL_PARAM, total.toString()); + } + return m; + } - final Message m = new Message(); - m.setWorkflowId(workflowId); - m.getBody().put(Message.CURRENT_PARAM, current.toString()); - if (total != null) - m.getBody().put(Message.TOTAL_PARAM, total.toString()); - return m; - } + private void _sendMessage(final Message message) { + try { + final String json = objectMapper.writeValueAsString(message); + final HttpPut req = new HttpPut(dnetMessageEndpoint); + req.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON)); - private void _sendMessage(final Message message) { - final HttpPut req = new HttpPut(dnetMessageEndpoint); - req.setEntity(new SerializableEntity(message)); + final RequestConfig requestConfig = RequestConfig + .custom() + .setConnectTimeout(CONNTECTION_TIMEOUT_MS) + .setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS) + .setSocketTimeout(SOCKET_TIMEOUT_MS) + .build(); + ; - final RequestConfig requestConfig = RequestConfig - .custom() - .setConnectTimeout(CONNTECTION_TIMEOUT_MS) - .setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS) - .setSocketTimeout(SOCKET_TIMEOUT_MS) - .build(); - ; - - try (final CloseableHttpClient client = HttpClients - .custom() - .setDefaultRequestConfig(requestConfig) - .build(); - final CloseableHttpResponse response = client.execute(req)) { - log.debug("Sent Message to " + dnetMessageEndpoint); - log.debug("MESSAGE:" + message); - } catch (final Throwable e) { - log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e); - } - } + try (final CloseableHttpClient client = HttpClients + .custom() + .setDefaultRequestConfig(requestConfig) + .build(); + final CloseableHttpResponse response = client.execute(req)) { + log.debug("Sent Message to " + dnetMessageEndpoint); + log.debug("MESSAGE:" + message); + } catch (final Throwable e) { + log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e); + } + } catch (final JsonProcessingException e) { + log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e); + } + } } From a8a758925e236d70d9fb4ccb2404ef95cd0f9370 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 5 Feb 2021 19:18:05 +0100 Subject: [PATCH 38/86] better logging, WIP: collectorWorker error reporting --- .../dhp/application/ApplicationUtils.java | 35 +- .../common/AggregationConstants.java | 1 + .../common/AggregationUtility.java | 28 +- 
.../GenerateNativeStoreSparkJob.java | 8 +- .../collection/plugin/CollectorPlugin.java | 5 +- .../plugin/oai/OaiCollectorPlugin.java | 32 +- .../collection/plugin/oai/OaiIterator.java | 26 +- .../plugin/oai/OaiIteratorFactory.java | 13 +- .../collection/worker/CollectorWorker.java | 76 +++-- .../worker/CollectorWorkerApplication.java | 80 ++++- .../worker/CollectorWorkerReporter.java | 69 ++++ .../utils/CollectorPluginErrorLogList.java | 19 -- .../worker/utils/CollectorPluginFactory.java | 9 +- .../worker/utils/CollectorPluginReport.java | 64 ++++ .../worker/utils/HttpClientParams.java | 62 ++++ .../worker/utils/HttpConnector.java | 218 ------------ .../worker/utils/HttpConnector2.java | 319 +++++++----------- .../UnknownCollectorPluginException.java | 32 ++ .../transformation/TransformSparkJobNode.java | 3 +- .../collector_reporter_input_parameter.json | 14 + ... => collector_worker_input_parameter.json} | 26 +- ... => generate_native_input_parameters.json} | 0 .../dhp/collection/oozie_app/workflow.xml | 15 +- .../httpconnector/HttpConnectorTest.java | 44 --- .../CollectorWorkerApplicationTests.java | 6 +- .../utils/CollectorPluginReportTest.java | 29 ++ 26 files changed, 650 insertions(+), 583 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginReport.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpClientParams.java delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/UnknownCollectorPluginException.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_reporter_input_parameter.json rename dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/{collector_parameter.json => collector_worker_input_parameter.json} (52%) rename dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/{collection_input_parameters.json => generate_native_input_parameters.json} (100%) delete mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java index 72c41a062..c78fb1b1f 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java @@ -2,18 +2,43 @@ package eu.dnetlib.dhp.application; import java.io.*; +import java.util.Map; import java.util.Properties; +import org.apache.hadoop.conf.Configuration; + +import com.google.common.collect.Maps; + public class ApplicationUtils { - public static void populateOOZIEEnv(final String paramName, String value) throws IOException { + public static Configuration getHadoopConfiguration(String nameNode) { + // ====== Init HDFS File System Object + Configuration 
conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", nameNode); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + + System.setProperty("hadoop.home.dir", "/"); + return conf; + } + + public static void populateOOZIEEnv(final Map report) throws IOException { File file = new File(System.getProperty("oozie.action.output.properties")); Properties props = new Properties(); + report.forEach((k, v) -> props.setProperty(k, v)); - props.setProperty(paramName, value); - OutputStream os = new FileOutputStream(file); - props.store(os, ""); - os.close(); + try(OutputStream os = new FileOutputStream(file)) { + props.store(os, ""); + } + } + + public static void populateOOZIEEnv(final String paramName, String value) throws IOException { + Map report = Maps.newHashMap(); + report.put(paramName, value); + + populateOOZIEEnv(report); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java index 7c5ad354d..8e0b7260d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.aggregation.common; public class AggregationConstants { public static final String SEQUENCE_FILE_NAME = "/sequence_file"; + public static final String REPORT_FILE_NAME = "/report"; public static final String MDSTORE_DATA_PATH = "/store"; public static final String MDSTORE_SIZE_PATH = "/size"; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java index 7332ac071..8dad5bb81 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java @@ -2,9 +2,14 @@ package eu.dnetlib.dhp.aggregation.common; import java.io.BufferedOutputStream; +import java.io.FileNotFoundException; import java.io.IOException; +import java.io.InputStream; import java.nio.charset.StandardCharsets; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -25,16 +30,31 @@ public class AggregationUtility { public static final ObjectMapper MAPPER = new ObjectMapper(); - public static void writeTotalSizeOnHDFS(final SparkSession spark, final Long total, final String path) + public static void writeHdfsFile(final Configuration conf, final String content, final String path) throws IOException { - log.info("writing size ({}) info file {}", total, path); - try (FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); + log.info("writing file {}, size {}", path, content.length()); + try (FileSystem fs = FileSystem.get(conf); BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) { - os.write(total.toString().getBytes(StandardCharsets.UTF_8)); + 
os.write(content.getBytes(StandardCharsets.UTF_8)); os.flush(); } + } + public static String readHdfsFile(Configuration conf, String path) throws IOException { + log.info("reading file {}", path); + + try (FileSystem fs = FileSystem.get(conf)) { + final Path p = new Path(path); + if (!fs.exists(p)) { + throw new FileNotFoundException(path); + } + return IOUtils.toString(fs.open(p)); + } + } + + public static T readHdfsFileAs(Configuration conf, String path, Class clazz) throws IOException { + return MAPPER.readValue(readHdfsFile(conf, path), clazz); } public static void saveDataset(final Dataset mdstore, final String targetPath) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index fdf3965d6..5839df2d0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -28,8 +28,6 @@ import org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; @@ -47,7 +45,7 @@ public class GenerateNativeStoreSparkJob { .toString( GenerateNativeStoreSparkJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); + "/eu/dnetlib/dhp/collection/generate_native_input_parameters.json"))); parser.parseArgument(args); final String provenanceArgument = parser.get("provenance"); @@ -148,7 +146,9 @@ public class GenerateNativeStoreSparkJob { final Long total = spark.read().load(targetPath).count(); log.info("collected {} records for datasource '{}'", total, provenance.getDatasourceName()); - writeTotalSizeOnHDFS(spark, total, currentVersion.getHdfsPath() + MDSTORE_SIZE_PATH); + writeHdfsFile( + spark.sparkContext().hadoopConfiguration(), total.toString(), + currentVersion.getHdfsPath() + MDSTORE_SIZE_PATH); } public static class MDStoreAggregator extends Aggregator { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index a0c546858..d0905aade 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -4,12 +4,11 @@ package eu.dnetlib.dhp.collection.plugin; import java.util.stream.Stream; import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; public interface CollectorPlugin { - Stream collect(ApiDescriptor api) throws CollectorException; + Stream collect(ApiDescriptor api, CollectorPluginReport report) throws CollectorException; - CollectorPluginErrorLogList getCollectionErrors(); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index ea74919c5..29e12f312 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -9,15 +9,14 @@ import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; -import org.jetbrains.annotations.NotNull; - import com.google.common.base.Splitter; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; +import eu.dnetlib.dhp.collection.worker.utils.HttpClientParams; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; public class OaiCollectorPlugin implements CollectorPlugin { @@ -29,19 +28,15 @@ public class OaiCollectorPlugin implements CollectorPlugin { private OaiIteratorFactory oaiIteratorFactory; - private final CollectorPluginErrorLogList errorLogList = new CollectorPluginErrorLogList(); + private HttpClientParams clientParams; - @Override - public Stream collect(final ApiDescriptor api) throws CollectorException { - try { - return doCollect(api); - } catch (CollectorException e) { - errorLogList.add(e.getMessage()); - throw e; - } + public OaiCollectorPlugin(HttpClientParams clientParams) { + this.clientParams = clientParams; } - private Stream doCollect(ApiDescriptor api) throws CollectorException { + @Override + public Stream collect(final ApiDescriptor api, final CollectorPluginReport report) + throws CollectorException { final String baseUrl = api.getBaseUrl(); final String mdFormat = api.getParams().get(FORMAT_PARAM); final String setParam = api.getParams().get(OAI_SET_PARAM); @@ -79,7 +74,7 @@ public class OaiCollectorPlugin implements CollectorPlugin { .stream() .map( set -> getOaiIteratorFactory() - .newIterator(baseUrl, mdFormat, set, fromDate, untilDate, errorLogList)) + .newIterator(baseUrl, mdFormat, set, fromDate, untilDate, getClientParams(), report)) .iterator(); return StreamSupport @@ -94,8 +89,11 @@ public class OaiCollectorPlugin implements CollectorPlugin { return oaiIteratorFactory; } - @Override - public CollectorPluginErrorLogList getCollectionErrors() { - return errorLogList; + public HttpClientParams getClientParams() { + return clientParams; + } + + public void setClientParams(HttpClientParams clientParams) { + this.clientParams = clientParams; } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index 9c1ff0663..667e7a3d3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -17,7 +17,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; import 
eu.dnetlib.dhp.collection.worker.utils.XmlCleaner; @@ -25,6 +25,8 @@ public class OaiIterator implements Iterator { private static final Logger log = LoggerFactory.getLogger(OaiIterator.class); + private final static String REPORT_PREFIX = "oai:"; + private final Queue queue = new PriorityBlockingQueue<>(); private final SAXReader reader = new SAXReader(); @@ -36,7 +38,7 @@ public class OaiIterator implements Iterator { private String token; private boolean started; private final HttpConnector2 httpConnector; - private CollectorPluginErrorLogList errorLogList; + private CollectorPluginReport errorLogList; public OaiIterator( final String baseUrl, @@ -45,7 +47,7 @@ public class OaiIterator implements Iterator { final String fromDate, final String untilDate, final HttpConnector2 httpConnector, - final CollectorPluginErrorLogList errorLogList) { + final CollectorPluginReport errorLogList) { this.baseUrl = baseUrl; this.mdFormat = mdFormat; this.set = set; @@ -111,7 +113,7 @@ public class OaiIterator implements Iterator { return downloadPage(url); } catch (final UnsupportedEncodingException e) { - errorLogList.add(e.getMessage()); + errorLogList.put(e.getClass().getName(), e.getMessage()); throw new CollectorException(e); } } @@ -137,7 +139,7 @@ public class OaiIterator implements Iterator { + "?verb=ListRecords&resumptionToken=" + URLEncoder.encode(resumptionToken, "UTF-8")); } catch (final UnsupportedEncodingException e) { - errorLogList.add(e.getMessage()); + errorLogList.put(e.getClass().getName(), e.getMessage()); throw new CollectorException(e); } } @@ -150,14 +152,14 @@ public class OaiIterator implements Iterator { doc = reader.read(new StringReader(xml)); } catch (final DocumentException e) { log.warn("Error parsing xml, I try to clean it. {}", e.getMessage()); - errorLogList.add(e.getMessage()); + errorLogList.put(e.getClass().getName(), e.getMessage()); final String cleaned = XmlCleaner.cleanAllEntities(xml); try { doc = reader.read(new StringReader(cleaned)); } catch (final DocumentException e1) { final String resumptionToken = extractResumptionToken(xml); if (resumptionToken == null) { - errorLogList.add(e1.getMessage()); + errorLogList.put(e1.getClass().getName(), e1.getMessage()); throw new CollectorException("Error parsing cleaned document:\n" + cleaned, e1); } return resumptionToken; @@ -166,15 +168,15 @@ public class OaiIterator implements Iterator { final Node errorNode = doc.selectSingleNode("/*[local-name()='OAI-PMH']/*[local-name()='error']"); if (errorNode != null) { - final String code = errorNode.valueOf("@code"); - if ("noRecordsMatch".equalsIgnoreCase(code.trim())) { + final String code = errorNode.valueOf("@code").trim(); + if ("noRecordsMatch".equalsIgnoreCase(code)) { final String msg = "noRecordsMatch for oai call : " + url; log.warn(msg); - errorLogList.add(msg); + errorLogList.put(REPORT_PREFIX + code, msg); return null; } else { final String msg = code + " - " + errorNode.getText(); - errorLogList.add(msg); + errorLogList.put(REPORT_PREFIX + "error", msg); throw new CollectorException(msg); } } @@ -186,7 +188,7 @@ public class OaiIterator implements Iterator { return doc.valueOf("//*[local-name()='resumptionToken']"); } - public CollectorPluginErrorLogList getErrorLogList() { + public CollectorPluginReport getErrorLogList() { return errorLogList; } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java index d6759580f..c751a94e7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java @@ -3,7 +3,8 @@ package eu.dnetlib.dhp.collection.plugin.oai; import java.util.Iterator; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; +import eu.dnetlib.dhp.collection.worker.utils.HttpClientParams; import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; public class OaiIteratorFactory { @@ -16,13 +17,15 @@ public class OaiIteratorFactory { final String set, final String fromDate, final String untilDate, - final CollectorPluginErrorLogList errorLogList) { - return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(), errorLogList); + final HttpClientParams clientParams, + final CollectorPluginReport errorLogList) { + return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(clientParams), + errorLogList); } - private HttpConnector2 getHttpConnector() { + private HttpConnector2 getHttpConnector(HttpClientParams clientParams) { if (httpConnector == null) - httpConnector = new HttpConnector2(); + httpConnector = new HttpConnector2(clientParams); return httpConnector; } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java index 9d82a1ed4..b0efd088c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java @@ -1,14 +1,13 @@ package eu.dnetlib.dhp.collection.worker; +import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.SEQUENCE_FILE_NAME; +import static eu.dnetlib.dhp.application.ApplicationUtils.*; + import java.io.IOException; -import java.net.URI; import java.util.concurrent.atomic.AtomicInteger; -import eu.dnetlib.dhp.message.Message; -import eu.dnetlib.dhp.message.MessageSender; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; @@ -16,10 +15,14 @@ import org.apache.hadoop.io.Text; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; +import eu.dnetlib.dhp.collection.worker.utils.HttpClientParams; +import eu.dnetlib.dhp.collection.worker.utils.UnknownCollectorPluginException; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; +import eu.dnetlib.dhp.message.MessageSender; public class CollectorWorker { @@ -27,70 +30,71 @@ public class CollectorWorker { private final ApiDescriptor api; - private final String hdfsuri; + private final Configuration conf; - private final String hdfsPath; + private final MDStoreVersion mdStoreVersion; + + private final 
HttpClientParams clientParams; + + private final CollectorPluginReport report; private final MessageSender messageSender; - public CollectorWorker( final ApiDescriptor api, - final String hdfsuri, - final String hdfsPath, - final MessageSender messageSender) { + final Configuration conf, + final MDStoreVersion mdStoreVersion, + final HttpClientParams clientParams, + final MessageSender messageSender, + final CollectorPluginReport report) { this.api = api; - this.hdfsuri = hdfsuri; - this.hdfsPath = hdfsPath; + this.conf = conf; + this.mdStoreVersion = mdStoreVersion; + this.clientParams = clientParams; this.messageSender = messageSender; + this.report = report; } - public CollectorPluginErrorLogList collect() throws IOException, CollectorException { + public void collect() throws UnknownCollectorPluginException, CollectorException, IOException { - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME; + log.info("outputPath path is {}", outputPath); - System.setProperty("hadoop.home.dir", "/"); - // Get the filesystem - HDFS - - FileSystem.get(URI.create(hdfsuri), conf); - Path hdfswritepath = new Path(hdfsPath); - - log.info("Created path " + hdfswritepath.toString()); - - final CollectorPlugin plugin = CollectorPluginFactory.getPluginByProtocol(api.getProtocol()); + final CollectorPlugin plugin = CollectorPluginFactory.getPluginByProtocol(clientParams, api.getProtocol()); final AtomicInteger counter = new AtomicInteger(0); try (SequenceFile.Writer writer = SequenceFile .createWriter( conf, - SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer.keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class))) { final IntWritable key = new IntWritable(counter.get()); final Text value = new Text(); plugin - .collect(api) + .collect(api, report) .forEach( content -> { key.set(counter.getAndIncrement()); - if (counter.get()% 500 == 0) + if (counter.get() % 500 == 0) messageSender.sendMessage(counter.longValue(), null); value.set(content); try { writer.append(key, value); - } catch (IOException e) { + } catch (Throwable e) { + report.put(e.getClass().getName(), e.getMessage()); + log.warn("setting report to failed"); + report.setSuccess(false); throw new RuntimeException(e); } }); + } catch (Throwable e) { + report.put(e.getClass().getName(), e.getMessage()); + log.warn("setting report to failed"); + report.setSuccess(false); } finally { - messageSender.sendMessage(counter.longValue(),counter.longValue()); - return plugin.getCollectionErrors(); + messageSender.sendMessage(counter.longValue(), counter.longValue()); } } + } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java index d8e8a8e49..8f26074c3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java @@ -6,22 +6,27 @@ import static 
eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; import static eu.dnetlib.dhp.application.ApplicationUtils.*; import java.io.IOException; +import java.util.Optional; -import eu.dnetlib.dhp.message.Message; -import eu.dnetlib.dhp.message.MessageSender; import org.apache.commons.cli.ParseException; +import org.apache.commons.io.FileSystemUtils; import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.aggregation.common.AggregationUtility; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; +import eu.dnetlib.dhp.collection.worker.utils.HttpClientParams; +import eu.dnetlib.dhp.collection.worker.utils.UnknownCollectorPluginException; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; +import eu.dnetlib.dhp.message.MessageSender; /** * CollectorWorkerApplication is the main class responsible to start the metadata collection process, storing the outcomes @@ -35,19 +40,18 @@ public class CollectorWorkerApplication { private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class); - public static final String COLLECTOR_WORKER_ERRORS = "collectorWorker-errors"; - /** * @param args */ - public static void main(final String[] args) throws ParseException, IOException, CollectorException { + public static void main(final String[] args) + throws ParseException, IOException, UnknownCollectorPluginException, CollectorException { final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser( IOUtils .toString( CollectorWorker.class .getResourceAsStream( - "/eu/dnetlib/dhp/collection/collector_parameter.json"))); + "/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json"))); argumentParser.parseArgument(args); final String hdfsuri = argumentParser.get("namenode"); @@ -65,21 +69,61 @@ public class CollectorWorkerApplication { final String workflowId = argumentParser.get("workflowId"); log.info("workflowId is {}", workflowId); - final MessageSender ms = new MessageSender(dnetMessageManagerURL,workflowId); + final MessageSender ms = new MessageSender(dnetMessageManagerURL, workflowId); final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class); - final String hdfsPath = currentVersion.getHdfsPath() + SEQUENCE_FILE_NAME; - log.info("hdfs path is {}", hdfsPath); + + final String reportPath = currentVersion.getHdfsPath() + REPORT_FILE_NAME; + log.info("report path is {}", reportPath); + + final HttpClientParams clientParams = getClientParams(argumentParser); final ApiDescriptor api = MAPPER.readValue(apiDescriptor, ApiDescriptor.class); + final Configuration conf = getHadoopConfiguration(hdfsuri); - final CollectorWorker worker = new CollectorWorker(api, hdfsuri, hdfsPath, ms); - CollectorPluginErrorLogList errors = worker.collect(); - - populateOOZIEEnv(COLLECTOR_WORKER_ERRORS, errors.toString()); - + try (CollectorPluginReport report = new CollectorPluginReport(FileSystem.get(conf), new 
Path(reportPath))) { + final CollectorWorker worker = new CollectorWorker(api, conf, currentVersion, clientParams, ms, report); + worker.collect(); + report.setSuccess(true); + } catch (Throwable e) { + log.info("got exception {}, ignoring", e.getMessage()); + } } + private static HttpClientParams getClientParams(ArgumentApplicationParser argumentParser) { + final HttpClientParams clientParams = new HttpClientParams(); + clientParams + .setMaxNumberOfRetry( + Optional + .ofNullable(argumentParser.get("maxNumberOfRetry")) + .map(Integer::parseInt) + .orElse(HttpClientParams._maxNumberOfRetry)); + log.info("maxNumberOfRetry is {}", clientParams.getMaxNumberOfRetry()); + clientParams + .setRetryDelay( + Optional + .ofNullable(argumentParser.get("retryDelay")) + .map(Integer::parseInt) + .orElse(HttpClientParams._retryDelay)); + log.info("retryDelay is {}", clientParams.getRetryDelay()); + + clientParams + .setConnectTimeOut( + Optional + .ofNullable(argumentParser.get("connectTimeOut")) + .map(Integer::parseInt) + .orElse(HttpClientParams._connectTimeOut)); + log.info("connectTimeOut is {}", clientParams.getConnectTimeOut()); + + clientParams + .setReadTimeOut( + Optional + .ofNullable(argumentParser.get("readTimeOut")) + .map(Integer::parseInt) + .orElse(HttpClientParams._readTimeOut)); + log.info("readTimeOut is {}", clientParams.getReadTimeOut()); + return clientParams; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java new file mode 100644 index 000000000..e0e402cfb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java @@ -0,0 +1,69 @@ + +package eu.dnetlib.dhp.collection.worker; + +import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.REPORT_FILE_NAME; +import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; +import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.MAPPER; +import static eu.dnetlib.dhp.application.ApplicationUtils.getHadoopConfiguration; +import static eu.dnetlib.dhp.application.ApplicationUtils.populateOOZIEEnv; + +import java.io.IOException; +import java.util.Objects; + +import org.apache.commons.cli.ParseException; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.webapp.hamlet.Hamlet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.aggregation.common.AggregationUtility; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; +import eu.dnetlib.dhp.collection.worker.utils.UnknownCollectorPluginException; + +/** + * CollectorWorkerReporter + */ +public class CollectorWorkerReporter { + + private static final Logger log = LoggerFactory.getLogger(CollectorWorkerReporter.class); + + /** + * @param args + */ + public static void main(final String[] args) throws IOException, ParseException, CollectorException { + + final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser( + IOUtils + .toString( + CollectorWorker.class + .getResourceAsStream( + "/eu/dnetlib/dhp/collection/collector_reporter_input_parameter.json"))); + argumentParser.parseArgument(args); + + final String nameNode = argumentParser.get("namenode"); + 
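
The name node just read points at the HDFS instance holding the report file; that file is produced by the worker's try-with-resources block shown earlier, whose CollectorPluginReport.close() serializes the accumulated map as JSON under the mdstore version's hdfsPath + "/report". A minimal write-side sketch, assuming a Hadoop Configuration named conf and a placeholder mdstore path:

    // close() persists the JSON report to HDFS and populates the oozie action output
    try (CollectorPluginReport report = new CollectorPluginReport(
            FileSystem.get(conf), new Path("/data/mdstore/v1" + REPORT_FILE_NAME))) { // placeholder path
        report.put("oai:noRecordsMatch", "noRecordsMatch for oai call : ..."); // sample error entry
        report.setSuccess(false);
    }

The reporter continues below by deserializing that file and turning an unsuccessful report into a CollectorException, which fails the workflow.
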
log.info("nameNode is {}", nameNode); + + final String mdStoreVersion = argumentParser.get("mdStoreVersion"); + log.info("mdStoreVersion is {}", mdStoreVersion); + + final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class); + + final String reportPath = currentVersion.getHdfsPath() + REPORT_FILE_NAME; + log.info("report path is {}", reportPath); + + final Configuration conf = getHadoopConfiguration(nameNode); + CollectorPluginReport report = readHdfsFileAs(conf, reportPath, CollectorPluginReport.class); + if (Objects.isNull(report)) { + throw new CollectorException("collection report is NULL"); + } + log.info("report success: {}, size: {}", report.isSuccess(), report.size()); + report.forEach((k, v) -> log.info("{} - {}", k, v)); + if (!report.isSuccess()) { + throw new CollectorException("collection report indicates a failure"); + } + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java deleted file mode 100644 index dcaf0ea56..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java +++ /dev/null @@ -1,19 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker.utils; - -import java.util.LinkedList; - -public class CollectorPluginErrorLogList extends LinkedList { - - private static final long serialVersionUID = -6925786561303289704L; - - @Override - public String toString() { - String log = ""; - int index = 0; - for (final String errorMessage : this) { - log += String.format("Retry #%s: %s / ", index++, errorMessage); - } - return log; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java index 7cbcd9b5c..ab7dad077 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java @@ -7,14 +7,15 @@ import eu.dnetlib.dhp.collection.worker.CollectorException; public class CollectorPluginFactory { - public static CollectorPlugin getPluginByProtocol(final String protocol) throws CollectorException { + public static CollectorPlugin getPluginByProtocol(final HttpClientParams clientParams, final String protocol) + throws UnknownCollectorPluginException { if (protocol == null) - throw new CollectorException("protocol cannot be null"); + throw new UnknownCollectorPluginException("protocol cannot be null"); switch (protocol.toLowerCase().trim()) { case "oai": - return new OaiCollectorPlugin(); + return new OaiCollectorPlugin(clientParams); default: - throw new CollectorException("UNknown protocol"); + throw new UnknownCollectorPluginException("Unknown protocol"); } } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginReport.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginReport.java new file mode 100644 index 000000000..b7bf539dc --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginReport.java @@ -0,0 +1,64 @@ + +package 
eu.dnetlib.dhp.collection.worker.utils; + +import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.MAPPER; + +import java.io.Closeable; +import java.io.IOException; +import java.util.LinkedHashMap; +import java.util.Objects; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.annotation.JsonIgnore; + +import eu.dnetlib.dhp.application.ApplicationUtils; + +public class CollectorPluginReport extends LinkedHashMap implements Closeable { + + private static final Logger log = LoggerFactory.getLogger(CollectorPluginReport.class); + + @JsonIgnore + private FileSystem fs; + + @JsonIgnore + private Path path; + + @JsonIgnore + private FSDataOutputStream fos; + + public static String SUCCESS = "success"; + + public CollectorPluginReport() { + } + + public CollectorPluginReport(FileSystem fs, Path path) throws IOException { + this.fs = fs; + this.path = path; + + this.fos = fs.create(path); + } + + public Boolean isSuccess() { + return Boolean.valueOf(get(SUCCESS)); + } + + public void setSuccess(Boolean success) { + put(SUCCESS, String.valueOf(success)); + } + + @Override + public void close() throws IOException { + final String data = MAPPER.writeValueAsString(this); + if (Objects.nonNull(fos)) { + log.info("writing report {} to {}", data, path.toString()); + IOUtils.write(data, fos); + ApplicationUtils.populateOOZIEEnv(this); + } + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpClientParams.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpClientParams.java new file mode 100644 index 000000000..e77f3680f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpClientParams.java @@ -0,0 +1,62 @@ + +package eu.dnetlib.dhp.collection.worker.utils; + +/** + * Bundles the http connection parameters driving the client behaviour. 
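+ *
+ * An illustrative usage sketch (not prescriptive): the endpoint URL is a placeholder,
+ * and the four constructor arguments shown are this class's own defaults, i.e. the
+ * retry count followed by retry delay, connect timeout and read timeout in seconds.
+ *
+ * <pre>
+ * HttpClientParams params = new HttpClientParams(3, 10, 10, 30);
+ * HttpConnector2 connector = new HttpConnector2(params);
+ * String xml = connector.getInputSource("https://example.org/oai?verb=Identify");
+ * </pre>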
+ */ +public class HttpClientParams { + + public static int _maxNumberOfRetry = 3; + public static int _retryDelay = 10; // seconds + public static int _connectTimeOut = 10; // seconds + public static int _readTimeOut = 30; // seconds + + private int maxNumberOfRetry; + private int retryDelay; + private int connectTimeOut; + private int readTimeOut; + + public HttpClientParams() { + this(_maxNumberOfRetry, _retryDelay, _connectTimeOut, _readTimeOut); + } + + public HttpClientParams(int maxNumberOfRetry, int retryDelay, int connectTimeOut, int readTimeOut) { + this.maxNumberOfRetry = maxNumberOfRetry; + this.retryDelay = retryDelay; + this.connectTimeOut = connectTimeOut; + this.readTimeOut = readTimeOut; + } + + public int getMaxNumberOfRetry() { + return maxNumberOfRetry; + } + + public void setMaxNumberOfRetry(int maxNumberOfRetry) { + this.maxNumberOfRetry = maxNumberOfRetry; + } + + public int getRetryDelay() { + return retryDelay; + } + + public void setRetryDelay(int retryDelay) { + this.retryDelay = retryDelay; + } + + public void setConnectTimeOut(int connectTimeOut) { + this.connectTimeOut = connectTimeOut; + } + + public int getConnectTimeOut() { + return connectTimeOut; + } + + public int getReadTimeOut() { + return readTimeOut; + } + + public void setReadTimeOut(int readTimeOut) { + this.readTimeOut = readTimeOut; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java deleted file mode 100644 index 39c2371b9..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java +++ /dev/null @@ -1,218 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker.utils; - -import java.io.IOException; -import java.io.InputStream; -import java.net.*; -import java.util.List; -import java.util.Map; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang.math.NumberUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.collection.worker.CollectorException; - -@Deprecated -public class HttpConnector { - - private static final Logger log = LoggerFactory.getLogger(HttpConnector.class); - - private int maxNumberOfRetry = 6; - private int defaultDelay = 120; // seconds - private int readTimeOut = 120; // seconds - - private String responseType = null; - - private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; - - public HttpConnector() { - CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); - } - - /** - * Given the URL returns the content via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource - * @throws CollectorException when retrying more than maxNumberOfRetry times - */ - public String getInputSource(final String requestUrl) throws CollectorException { - return attemptDownloadAsString(requestUrl, 1, new CollectorPluginErrorLogList()); - } - - /** - * Given the URL returns the content via HTTP GET - * - * @param requestUrl the URL - * @param errorLogList the list of errors - * @return the content of the downloaded resource - * @throws CollectorException when retrying more than maxNumberOfRetry times - */ - public String getInputSource(final String requestUrl, CollectorPluginErrorLogList errorLogList) - throws CollectorException { - return attemptDownloadAsString(requestUrl, 1, errorLogList); - } - - /** - * Given the URL 
returns the content as a stream via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource as InputStream - * @throws CollectorException when retrying more than maxNumberOfRetry times - */ - public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException { - return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - } - - private String attemptDownloadAsString( - final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws CollectorException { - - log.info("requesting URL [{}]", requestUrl); - try { - final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - try { - return IOUtils.toString(s); - } catch (final IOException e) { - log.error("error while retrieving from http-connection occurred: {}", requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownloadAsString(requestUrl, retryNumber + 1, errorList); - } finally { - IOUtils.closeQuietly(s); - } - } catch (final InterruptedException e) { - throw new CollectorException(e); - } - } - - private InputStream attemptDownload( - final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws CollectorException { - - if (retryNumber > maxNumberOfRetry) { - throw new CollectorException("Max number of retries exceeded. Cause: \n " + errorList); - } - - log.debug("requesting URL [{}], try {}", requestUrl, retryNumber); - try { - InputStream input = null; - - try { - final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); - urlConn.setInstanceFollowRedirects(false); - urlConn.setReadTimeout(readTimeOut * 1000); - urlConn.addRequestProperty("User-Agent", userAgent); - - if (log.isDebugEnabled()) { - logHeaderFields(urlConn); - } - - final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); - if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { - log.warn("waiting and repeating request after {} sec.", retryAfter); - Thread.sleep(retryAfter * 1000); - errorList.add("503 Service Unavailable"); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM - || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) { - final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); - log.debug("The requested url has been moved to {}", newUrl); - errorList - .add( - String - .format( - "%s %s. 
Moved to: %s", - urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl)); - urlConn.disconnect(); - return attemptDownload(newUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { - final String msg = String - .format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage()); - log.error(msg); - Thread.sleep(defaultDelay * 1000); - errorList.add(msg); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else { - input = urlConn.getInputStream(); - responseType = urlConn.getContentType(); - return input; - } - } catch (final IOException e) { - log.error("error while retrieving from http-connection occurred: {}", requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } - } catch (final InterruptedException e) { - throw new CollectorException(e); - } - } - - private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { - log.debug("StatusCode: {}", urlConn.getResponseMessage()); - - for (final Map.Entry> e : urlConn.getHeaderFields().entrySet()) { - if (e.getKey() != null) { - for (final String v : e.getValue()) { - log.debug(" key: {} value: {}", e.getKey(), v); - } - } - } - } - - private int obtainRetryAfter(final Map> headerMap) { - for (final String key : headerMap.keySet()) { - if (key != null - && key.toLowerCase().equals("retry-after") - && headerMap.get(key).size() > 0 - && NumberUtils.isNumber(headerMap.get(key).get(0))) { - return Integer.parseInt(headerMap.get(key).get(0)) + 10; - } - } - return -1; - } - - private String obtainNewLocation(final Map> headerMap) - throws CollectorException { - for (final String key : headerMap.keySet()) { - if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) { - return headerMap.get(key).get(0); - } - } - throw new CollectorException( - "The requested url has been MOVED, but 'location' param is MISSING"); - } - - public int getMaxNumberOfRetry() { - return maxNumberOfRetry; - } - - public void setMaxNumberOfRetry(final int maxNumberOfRetry) { - this.maxNumberOfRetry = maxNumberOfRetry; - } - - public int getDefaultDelay() { - return defaultDelay; - } - - public void setDefaultDelay(final int defaultDelay) { - this.defaultDelay = defaultDelay; - } - - public int getReadTimeOut() { - return readTimeOut; - } - - public void setReadTimeOut(final int readTimeOut) { - this.readTimeOut = readTimeOut; - } - - public String getResponseType() { - return responseType; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector2.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector2.java index b316f34ed..68b1ef8ad 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector2.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector2.java @@ -1,24 +1,17 @@ package eu.dnetlib.dhp.collection.worker.utils; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.net.*; -import java.security.GeneralSecurityException; -import java.security.cert.X509Certificate; import java.util.List; import java.util.Map; -import javax.net.ssl.HttpsURLConnection; -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import 
javax.net.ssl.X509TrustManager; - import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.math.NumberUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +import org.apache.http.HttpHeaders; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.collection.worker.CollectorException; @@ -29,162 +22,151 @@ import eu.dnetlib.dhp.collection.worker.CollectorException; */ public class HttpConnector2 { - private static final Log log = LogFactory.getLog(HttpConnector.class); + private static final Logger log = LoggerFactory.getLogger(HttpConnector2.class); - private int maxNumberOfRetry = 6; - private int defaultDelay = 120; // seconds - private int readTimeOut = 120; // seconds + private static final String REPORT_PREFIX = "http:"; + + private HttpClientParams clientParams; private String responseType = null; private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; public HttpConnector2() { + this(new HttpClientParams()); + } + + public HttpConnector2(HttpClientParams clientParams) { + this.clientParams = clientParams; CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); } /** - * @see HttpConnector2#getInputSource(java.lang.String, eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList) - */ - public String getInputSource(final String requestUrl) throws CollectorException { - return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); - } - - /** - * @see HttpConnector2#getInputSource(java.lang.String, eu.dnetlib.dhp.collection.worker.utils.CollectorPluginErrorLogList) + * @see HttpConnector2#getInputSource(java.lang.String, CollectorPluginReport) */ public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException { return IOUtils.toInputStream(getInputSource(requestUrl)); } + /** + * @see HttpConnector2#getInputSource(java.lang.String, CollectorPluginReport) + */ + public String getInputSource(final String requestUrl) throws CollectorException { + return attemptDownloadAsString(requestUrl, 1, new CollectorPluginReport()); + } + /** * Given the URL returns the content via HTTP GET * * @param requestUrl the URL - * @param errorLogList the list of errors + * @param report the list of errors * @return the content of the downloaded resource * @throws CollectorException when retrying more than maxNumberOfRetry times */ - public String getInputSource(final String requestUrl, CollectorPluginErrorLogList errorLogList) + public String getInputSource(final String requestUrl, CollectorPluginReport report) throws CollectorException { - return attemptDownlaodAsString(requestUrl, 1, errorLogList); + return attemptDownloadAsString(requestUrl, 1, report); } - private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, - final CollectorPluginErrorLogList errorList) - throws CollectorException { - try { - InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - try { - return IOUtils.toString(s); - } catch (IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); - } finally { - IOUtils.closeQuietly(s); - } - } catch (InterruptedException e) { + private String attemptDownloadAsString(final String requestUrl, final int retryNumber, + final CollectorPluginReport report) throws 
CollectorException { + + try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) { + return IOUtils.toString(s); + } catch (IOException e) { + log.error(e.getMessage(), e); throw new CollectorException(e); } } private InputStream attemptDownload(final String requestUrl, final int retryNumber, - final CollectorPluginErrorLogList errorList) - throws CollectorException { + final CollectorPluginReport report) throws CollectorException, IOException { - if (retryNumber > maxNumberOfRetry) { - throw new CollectorException("Max number of retries exceeded. Cause: \n " + errorList); + if (retryNumber > getClientParams().getMaxNumberOfRetry()) { + throw new CollectorException("Max number of retries exceeded. Cause: \n " + report); } - log.debug("Downloading " + requestUrl + " - try: " + retryNumber); + log.info("Downloading attempt {} [{}]", retryNumber, requestUrl); + + InputStream input = null; + try { - InputStream input = null; + final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); + urlConn.setInstanceFollowRedirects(false); + urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000); + urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000); + urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent); - try { - final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); - urlConn.setInstanceFollowRedirects(false); - urlConn.setReadTimeout(readTimeOut * 1000); - urlConn.addRequestProperty("User-Agent", userAgent); - - if (log.isDebugEnabled()) { - logHeaderFields(urlConn); - } - - int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); - if (is2xx(urlConn.getResponseCode())) { - input = urlConn.getInputStream(); - responseType = urlConn.getContentType(); - return input; - } - if (is3xx(urlConn.getResponseCode())) { - // REDIRECTS - final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); - log.debug(String.format("The requested url %s has been moved to %s", requestUrl, newUrl)); - errorList - .add( - String - .format( - "%s %s %s. Moved to: %s", requestUrl, urlConn.getResponseCode(), - urlConn.getResponseMessage(), newUrl)); - urlConn.disconnect(); - if (retryAfter > 0) - Thread.sleep(retryAfter * 1000); - return attemptDownload(newUrl, retryNumber + 1, errorList); - } - if (is4xx(urlConn.getResponseCode())) { - // CLIENT ERROR, DO NOT RETRY - errorList - .add( - String - .format( - "%s error %s: %s", requestUrl, urlConn.getResponseCode(), - urlConn.getResponseMessage())); - throw new CollectorException("4xx error: request will not be repeated. 
" + errorList); - } - if (is5xx(urlConn.getResponseCode())) { - // SERVER SIDE ERRORS RETRY ONLY on 503 - switch (urlConn.getResponseCode()) { - case HttpURLConnection.HTTP_UNAVAILABLE: - if (retryAfter > 0) { - log - .warn( - requestUrl + " - waiting and repeating request after suggested retry-after " - + retryAfter + " sec."); - Thread.sleep(retryAfter * 1000); - } else { - log - .warn( - requestUrl + " - waiting and repeating request after default delay of " - + defaultDelay + " sec."); - Thread.sleep(defaultDelay * 1000); - } - errorList.add(requestUrl + " 503 Service Unavailable"); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - default: - errorList - .add( - String - .format( - "%s Error %s: %s", requestUrl, urlConn.getResponseCode(), - urlConn.getResponseMessage())); - throw new CollectorException(urlConn.getResponseCode() + " error " + errorList); - } - } - throw new CollectorException( - String.format("Unexpected status code: %s error %s", urlConn.getResponseCode(), errorList)); - } catch (MalformedURLException | NoRouteToHostException e) { - errorList.add(String.format("Error: %s for request url: %s", e.getCause(), requestUrl)); - throw new CollectorException(e + "error " + errorList); - } catch (IOException e) { - Thread.sleep(defaultDelay * 1000); - errorList.add(requestUrl + " " + e.getMessage()); - return attemptDownload(requestUrl, retryNumber + 1, errorList); + if (log.isDebugEnabled()) { + logHeaderFields(urlConn); } - } catch (InterruptedException e) { - throw new CollectorException(e); + + int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); + if (is2xx(urlConn.getResponseCode())) { + input = urlConn.getInputStream(); + responseType = urlConn.getContentType(); + return input; + } + if (is3xx(urlConn.getResponseCode())) { + // REDIRECTS + final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); + log.info(String.format("The requested url has been moved to %s", newUrl)); + report + .put( + REPORT_PREFIX + urlConn.getResponseCode(), + String.format("Moved to: %s", newUrl)); + urlConn.disconnect(); + if (retryAfter > 0) { + backoffAndSleep(retryAfter); + } + return attemptDownload(newUrl, retryNumber + 1, report); + } + if (is4xx(urlConn.getResponseCode())) { + // CLIENT ERROR, DO NOT RETRY + report + .put( + REPORT_PREFIX + urlConn.getResponseCode(), + String + .format( + "%s error: %s", requestUrl, urlConn.getResponseMessage())); + throw new CollectorException("4xx error: request will not be repeated. 
" + report); + } + if (is5xx(urlConn.getResponseCode())) { + // SERVER SIDE ERRORS RETRY ONLY on 503 + switch (urlConn.getResponseCode()) { + case HttpURLConnection.HTTP_UNAVAILABLE: + if (retryAfter > 0) { + log + .warn( + requestUrl + " - waiting and repeating request after suggested retry-after " + + retryAfter + " sec."); + backoffAndSleep(retryAfter * 1000); + } else { + log + .warn( + requestUrl + " - waiting and repeating request after default delay of " + + getClientParams().getRetryDelay() + " sec."); + backoffAndSleep(retryNumber * getClientParams().getRetryDelay() * 1000); + } + report.put(REPORT_PREFIX + urlConn.getResponseCode(), requestUrl); + urlConn.disconnect(); + return attemptDownload(requestUrl, retryNumber + 1, report); + default: + report + .put( + REPORT_PREFIX + urlConn.getResponseCode(), + String + .format( + "%s Error: %s", requestUrl, urlConn.getResponseMessage())); + throw new CollectorException(urlConn.getResponseCode() + " error " + report); + } + } + throw new CollectorException( + String.format("Unexpected status code: %s error %s", urlConn.getResponseCode(), report)); + } catch (MalformedURLException | SocketException | UnknownHostException e) { + log.error(e.getMessage(), e); + report.put(e.getClass().getName(), e.getMessage()); + throw new CollectorException(e.getMessage(), e); } } @@ -200,12 +182,21 @@ public class HttpConnector2 { } } + private void backoffAndSleep(int sleepTime) throws CollectorException { + log.info("I'm going to sleep for {}ms", sleepTime); + try { + Thread.sleep(sleepTime); + } catch (InterruptedException e) { + log.error(e.getMessage(), e); + throw new CollectorException(e); + } + } + private int obtainRetryAfter(final Map> headerMap) { for (String key : headerMap.keySet()) { - if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) + if ((key != null) && key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) && (headerMap.get(key).size() > 0) && NumberUtils.isCreatable(headerMap.get(key).get(0))) { - return Integer - .parseInt(headerMap.get(key).get(0)) + 10; + return Integer.parseInt(headerMap.get(key).get(0)) + 10; } } return -1; @@ -213,44 +204,13 @@ public class HttpConnector2 { private String obtainNewLocation(final Map> headerMap) throws CollectorException { for (String key : headerMap.keySet()) { - if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { + if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) { return headerMap.get(key).get(0); } } throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING"); } - /** - * register for https scheme; this is a workaround and not intended for the use in trusted environments - */ - public void initTrustManager() { - final X509TrustManager tm = new X509TrustManager() { - - @Override - public void checkClientTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public void checkServerTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - }; - try { - final SSLContext ctx = SSLContext.getInstance("TLS"); - ctx.init(null, new TrustManager[] { - tm - }, null); - HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); - } catch (GeneralSecurityException e) { - log.fatal(e); - throw new IllegalStateException(e); - } - } - private boolean is2xx(final int statusCode) { return statusCode >= 200 && 
statusCode <= 299; } @@ -267,32 +227,15 @@ public class HttpConnector2 { return statusCode >= 500 && statusCode <= 599; } - public int getMaxNumberOfRetry() { - return maxNumberOfRetry; - } - - public void setMaxNumberOfRetry(final int maxNumberOfRetry) { - this.maxNumberOfRetry = maxNumberOfRetry; - } - - public int getDefaultDelay() { - return defaultDelay; - } - - public void setDefaultDelay(final int defaultDelay) { - this.defaultDelay = defaultDelay; - } - - public int getReadTimeOut() { - return readTimeOut; - } - - public void setReadTimeOut(final int readTimeOut) { - this.readTimeOut = readTimeOut; - } - public String getResponseType() { return responseType; } + public HttpClientParams getClientParams() { + return clientParams; + } + + public void setClientParams(HttpClientParams clientParams) { + this.clientParams = clientParams; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/UnknownCollectorPluginException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/UnknownCollectorPluginException.java new file mode 100644 index 000000000..c55d485e2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/UnknownCollectorPluginException.java @@ -0,0 +1,32 @@ + +package eu.dnetlib.dhp.collection.worker.utils; + +public class UnknownCollectorPluginException extends Exception { + + /** */ + private static final long serialVersionUID = -290723075076039757L; + + public UnknownCollectorPluginException() { + super(); + } + + public UnknownCollectorPluginException( + final String message, + final Throwable cause, + final boolean enableSuppression, + final boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } + + public UnknownCollectorPluginException(final String message, final Throwable cause) { + super(message, cause); + } + + public UnknownCollectorPluginException(final String message) { + super(message); + } + + public UnknownCollectorPluginException(final Throwable cause) { + super(cause); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index e1b1b849c..b735ecb1f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -105,7 +105,8 @@ public class TransformSparkJobNode { log.info("Total item " + ct.getTotalItems().count()); log.info("Transformation Error item " + ct.getErrorItems().count()); - writeTotalSizeOnHDFS(spark, mdstore.count(), outputBasePath + MDSTORE_SIZE_PATH); + writeHdfsFile( + spark.sparkContext().hadoopConfiguration(), "" + mdstore.count(), outputBasePath + MDSTORE_SIZE_PATH); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_reporter_input_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_reporter_input_parameter.json new file mode 100644 index 000000000..ef65cc389 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_reporter_input_parameter.json @@ -0,0 +1,14 @@ +[ + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription": "the Name Node URI", + "paramRequired": true + }, + { + 
"paramName": "mv", + "paramLongName": "mdStoreVersion", + "paramDescription": "the MDStore Version bean", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json similarity index 52% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json index 6ccba468a..f3eaf2d71 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json @@ -23,12 +23,34 @@ "paramDescription": "the End point URL to send Messages", "paramRequired": true }, - - { "paramName": "w", "paramLongName": "workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": true + }, + { + "paramName": "mr", + "paramLongName": "maxNumberOfRetry", + "paramDescription": "the maximum number of admitted connection retries", + "paramRequired": false + }, + { + "paramName": "rd", + "paramLongName": "retryDelay", + "paramDescription": "the delay (ms) between retries", + "paramRequired": false + }, + { + "paramName": "ct", + "paramLongName": "connectTimeOut", + "paramDescription": "the maximum allowed time (ms) to connect to the remote host", + "paramRequired": false + }, + { + "paramName": "rt", + "paramLongName": "readTimeOut", + "paramDescription": "the maximum allowed time (ms) to receive content from the remote host", + "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/generate_native_input_parameters.json similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/generate_native_input_parameters.json diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index b74ef6b61..e7f6b9201 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -94,9 +94,22 @@ --workflowId${workflowId} --dnetMessageManagerURL${dnetMessageManagerURL} --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} - + --maxNumberOfRetry${maxNumberOfRetry} + --retryDelay${retryDelay} + --connectTimeOut${connectTimeOut} + --readTimeOut${readTimeOut} + + + + + + + eu.dnetlib.dhp.collection.worker.CollectorWorkerReporter + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --namenode${nameNode} + diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java deleted file mode 100644 index 103e11c33..000000000 
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java +++ /dev/null @@ -1,44 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.httpconnector; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.ssl.SSLContextBuilder; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; - -@Disabled -public class HttpConnectorTest { - - private static final Log log = LogFactory.getLog(HttpConnectorTest.class); - private static HttpConnector2 connector; - - private static final String URL = "http://cordis.europa.eu/data/reference/cordisref-H2020topics.xlsx"; - private static final String URL_MISCONFIGURED_SERVER = "https://www.alexandria.unisg.ch/cgi/oai2?verb=Identify"; - private static final String URL_GOODSNI_SERVER = "https://air.unimi.it/oai/openaire?verb=Identify"; - - private static final SSLContextBuilder sslContextBuilder = new SSLContextBuilder(); - private static SSLConnectionSocketFactory sslSocketFactory; - - @BeforeAll - public static void setUp() { - connector = new HttpConnector2(); - } - - @Test - - public void testGetInputSource() throws CollectorException { - System.out.println(connector.getInputSource(URL)); - } - - @Test - public void testGoodServers() throws CollectorException { - System.out.println(connector.getInputSource(URL_GOODSNI_SERVER)); - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java index 84cad8e19..65c2833eb 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java @@ -13,6 +13,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.collection.worker.CollectorWorker; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; +import eu.dnetlib.dhp.collection.worker.utils.HttpClientParams; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; @Disabled @@ -21,8 +22,9 @@ public class CollectorWorkerApplicationTests { @Test public void testFindPlugin() throws Exception { final CollectorPluginFactory collectorPluginEnumerator = new CollectorPluginFactory(); - assertNotNull(collectorPluginEnumerator.getPluginByProtocol("oai")); - assertNotNull(collectorPluginEnumerator.getPluginByProtocol("OAI")); + final HttpClientParams clientParams = new HttpClientParams(); + assertNotNull(collectorPluginEnumerator.getPluginByProtocol(clientParams, "oai")); + assertNotNull(collectorPluginEnumerator.getPluginByProtocol(clientParams, "OAI")); } @Test diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java new file mode 100644 index 000000000..69376d5eb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java @@ -0,0 +1,29 @@ + +package 
eu.dnetlib.dhp.collector.worker.utils; + +import java.io.IOException; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.dhp.aggregation.common.AggregationUtility; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; + +public class CollectorPluginReportTest { + + @Test + public void testSerialize() throws IOException { + CollectorPluginReport r1 = new CollectorPluginReport(); + r1.put("a", "b"); + r1.setSuccess(true); + + String s = AggregationUtility.MAPPER.writeValueAsString(r1); + + Assertions.assertNotNull(s); + + CollectorPluginReport r2 = AggregationUtility.MAPPER.readValue(s, CollectorPluginReport.class); + + Assertions.assertTrue(r2.isSuccess(), "should be true"); + } + +} From 40df0f987ded40ab983742e3b090afbb5611d0f5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Sat, 6 Feb 2021 20:12:00 +0100 Subject: [PATCH 39/86] better logging, WIP: collectorWorker error reporting; common functions moved in DHPUtils --- .../dhp/application/ApplicationUtils.java | 30 ------- .../java/eu/dnetlib/dhp/utils/DHPUtils.java | 85 ++++++++++++++++++- .../actionmanager/project/utils/ReadCSV.java | 2 +- .../project/utils/ReadExcel.java | 2 +- .../common/AggregationUtility.java | 69 --------------- .../mdstore/MDStoreActionNode.java | 37 +++----- .../GenerateNativeStoreSparkJob.java | 2 +- .../collection/plugin/CollectorPlugin.java | 2 +- .../plugin/oai/OaiCollectorPlugin.java | 4 +- .../collection/plugin/oai/OaiIterator.java | 6 +- .../plugin/oai/OaiIteratorFactory.java | 6 +- .../{utils => }/CollectorPluginFactory.java | 3 +- .../{utils => }/CollectorPluginReport.java | 8 +- .../collection/worker/CollectorWorker.java | 5 -- .../worker/CollectorWorkerApplication.java | 10 +-- .../worker/CollectorWorkerReporter.java | 9 +- .../worker/{utils => }/HttpClientParams.java | 2 +- .../worker/{utils => }/HttpConnector2.java | 16 +++- .../UnknownCollectorPluginException.java | 2 +- .../worker/{utils => }/XmlCleaner.java | 2 +- .../transformation/TransformSparkJobNode.java | 2 +- .../project/EXCELParserTest.java | 2 +- .../CollectorWorkerApplicationTests.java | 8 +- .../utils/CollectorPluginReportTest.java | 9 +- 24 files changed, 138 insertions(+), 185 deletions(-) delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/{utils => }/CollectorPluginFactory.java (84%) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/{utils => }/CollectorPluginReport.java (86%) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/{utils => }/HttpClientParams.java (96%) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/{utils => }/HttpConnector2.java (93%) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/{utils => }/UnknownCollectorPluginException.java (93%) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/{utils => }/XmlCleaner.java (99%) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java index c78fb1b1f..c53b83561 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ApplicationUtils.java @@ -11,34 +11,4 @@ import com.google.common.collect.Maps; 
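Patch 39 consolidates the shared HDFS helpers into DHPUtils (the diffs that follow remove them from ApplicationUtils and AggregationUtility and add them there). A minimal round-trip sketch of those helpers; the namenode URI, the file path and the bean type are illustrative assumptions, not values from the patch.

import org.apache.hadoop.conf.Configuration;

import eu.dnetlib.dhp.utils.DHPUtils;

public class DhpUtilsRoundTripSketch {

	public static void main(String[] args) throws Exception {
		// build a Hadoop configuration pointing at an (assumed) namenode
		Configuration conf = DHPUtils.getHadoopConfiguration("hdfs://localhost:8020");

		// serialize a value with the shared ObjectMapper and persist it
		String json = DHPUtils.MAPPER.writeValueAsString(new int[] { 1, 2, 3 });
		DHPUtils.writeHdfsFile(conf, json, "/tmp/report.json");

		// read it back, raw or deserialized into a target class
		String raw = DHPUtils.readHdfsFile(conf, "/tmp/report.json");
		int[] values = DHPUtils.readHdfsFileAs(conf, "/tmp/report.json", int[].class);
		System.out.println(raw + " -> " + values.length + " values");
	}
}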
public class ApplicationUtils { - public static Configuration getHadoopConfiguration(String nameNode) { - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", nameNode); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - - System.setProperty("hadoop.home.dir", "/"); - return conf; - } - - public static void populateOOZIEEnv(final Map<String, String> report) throws IOException { - File file = new File(System.getProperty("oozie.action.output.properties")); - Properties props = new Properties(); - report.forEach((k, v) -> props.setProperty(k, v)); - - try(OutputStream os = new FileOutputStream(file)) { - props.store(os, ""); - } - } - - public static void populateOOZIEEnv(final String paramName, String value) throws IOException { - Map<String, String> report = Maps.newHashMap(); - report.put(paramName, value); - - populateOOZIEEnv(report); - } - } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java index 8872174a5..8d760a2cd 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java @@ -1,18 +1,29 @@ package eu.dnetlib.dhp.utils; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; +import java.io.*; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.util.List; +import java.util.Map; +import java.util.Properties; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.binary.Base64OutputStream; import org.apache.commons.codec.binary.Hex; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; import com.jayway.jsonpath.JsonPath; import net.minidev.json.JSONArray; @@ -21,6 +32,8 @@ import scala.collection.Seq; public class DHPUtils { + private static final Logger log = LoggerFactory.getLogger(DHPUtils.class); + public static Seq<String> toSeq(List<String> list) { return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq(); } @@ -79,4 +92,72 @@ public class DHPUtils { return ""; } } + + public static final ObjectMapper MAPPER = new ObjectMapper(); + + public static void writeHdfsFile(final Configuration conf, final String content, final String path) + throws IOException { + + log.info("writing file {}, size {}", path, content.length()); + try (FileSystem fs = FileSystem.get(conf); + BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) { + os.write(content.getBytes(StandardCharsets.UTF_8)); + os.flush(); + } + } + + public static String readHdfsFile(Configuration conf, String path) throws IOException { + log.info("reading file {}", path); + + try (FileSystem fs = FileSystem.get(conf)) { + final Path p = new Path(path); + if (!fs.exists(p)) { + throw new FileNotFoundException(path); + } + return IOUtils.toString(fs.open(p)); + } + } + + public static <T> T readHdfsFileAs(Configuration conf, String path, Class<T> clazz) throws
IOException { + return MAPPER.readValue(readHdfsFile(conf, path), clazz); + } + + public static void saveDataset(final Dataset<?> mdstore, final String targetPath) { + log.info("saving dataset in: {}", targetPath); + mdstore + .write() + .mode(SaveMode.Overwrite) + .format("parquet") + .save(targetPath); + } + + public static Configuration getHadoopConfiguration(String nameNode) { + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", nameNode); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + + System.setProperty("hadoop.home.dir", "/"); + return conf; + } + + public static void populateOOZIEEnv(final Map<String, String> report) throws IOException { + File file = new File(System.getProperty("oozie.action.output.properties")); + Properties props = new Properties(); + report.forEach((k, v) -> props.setProperty(k, v)); + + try (OutputStream os = new FileOutputStream(file)) { + props.store(os, ""); + } + } + + public static void populateOOZIEEnv(final String paramName, String value) throws IOException { + Map<String, String> report = Maps.newHashMap(); + report.put(paramName, value); + + populateOOZIEEnv(report); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java index 1b9e070fe..3f64eb953 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java @@ -18,7 +18,7 @@ import org.apache.hadoop.fs.Path; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; +import eu.dnetlib.dhp.collection.worker.HttpConnector2; /** * Applies the parsing of a csv file and writes the Serialization of it in hdfs diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java index 2ad3f5b34..c661909b0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java @@ -15,7 +15,7 @@ import org.apache.hadoop.fs.Path; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; +import eu.dnetlib.dhp.collection.worker.HttpConnector2; /** * Applies the parsing of an excel file and writes the Serialization of it in hdfs diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java deleted file mode 100644 index 8dad5bb81..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationUtility.java +++ /dev/null @@ -1,69 +0,0 @@ - -package eu.dnetlib.dhp.aggregation.common; - -import java.io.BufferedOutputStream; -import java.io.FileNotFoundException; -import
java.io.IOException; -import java.io.InputStream; -import java.nio.charset.StandardCharsets; - -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; - -public class AggregationUtility { - - private static final Logger log = LoggerFactory.getLogger(AggregationUtility.class); - - public static final ObjectMapper MAPPER = new ObjectMapper(); - - public static void writeHdfsFile(final Configuration conf, final String content, final String path) - throws IOException { - - log.info("writing file {}, size {}", path, content.length()); - try (FileSystem fs = FileSystem.get(conf); - BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) { - os.write(content.getBytes(StandardCharsets.UTF_8)); - os.flush(); - } - } - - public static String readHdfsFile(Configuration conf, String path) throws IOException { - log.info("reading file {}", path); - - try (FileSystem fs = FileSystem.get(conf)) { - final Path p = new Path(path); - if (!fs.exists(p)) { - throw new FileNotFoundException(path); - } - return IOUtils.toString(fs.open(p)); - } - } - - public static <T> T readHdfsFileAs(Configuration conf, String path, Class<T> clazz) throws IOException { - return MAPPER.readValue(readHdfsFile(conf, path), clazz); - } - - public static void saveDataset(final Dataset<MetadataRecord> mdstore, final String targetPath) { - log.info("saving dataset in: {}", targetPath); - mdstore - .write() - .mode(SaveMode.Overwrite) - .format("parquet") - .save(targetPath); - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java index 3e471cfc8..9a47a1d66 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java @@ -1,18 +1,14 @@ package eu.dnetlib.dhp.aggregation.mdstore; -import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; +import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; import static eu.dnetlib.dhp.application.ApplicationUtils.*; +import static eu.dnetlib.dhp.utils.DHPUtils.*; -import java.io.File; -import java.io.FileOutputStream; -import java.io.OutputStream; import java.net.URI; -import java.util.Properties; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -80,29 +76,20 @@ public class MDStoreActionNode { throw new IllegalArgumentException( "invalid MDStoreVersion value current is " + mdStoreVersion_params); } + Path hdfstoreSizepath = new Path(mdStoreVersion.getHdfsPath() + MDSTORE_SIZE_PATH); - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because
of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + try ( + FileSystem fs = FileSystem.get(URI.create(hdfsuri), getHadoopConfiguration(hdfsuri)); + FSDataInputStream inputStream = fs.open(hdfstoreSizepath)) { - System.setProperty("hadoop.home.dir", "/"); - // Get the filesystem - HDFS - FileSystem fs = FileSystem.get(URI.create(hdfsuri), conf); + final Long mdStoreSize = Long.parseLong(IOUtils.toString(inputStream)); - Path hdfstoreSizepath = new Path(mdStoreVersion.getHdfsPath() + "/size"); + fs.create(hdfstoreSizepath); + DNetRestClient + .doGET( + String.format(COMMIT_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId(), mdStoreSize)); + } - FSDataInputStream inputStream = fs.open(hdfstoreSizepath); - - final Long mdStoreSize = Long.parseLong(IOUtils.toString(inputStream)); - - inputStream.close(); - fs.create(hdfstoreSizepath); - - DNetRestClient - .doGET(String.format(COMMIT_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId(), mdStoreSize)); break; } case ROLLBACK: { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index 5839df2d0..5c24bb7ec 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -2,8 +2,8 @@ package eu.dnetlib.dhp.collection; import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; -import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.utils.DHPUtils.*; import java.io.ByteArrayInputStream; import java.io.IOException; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index d0905aade..614aa4e69 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -4,7 +4,7 @@ package eu.dnetlib.dhp.collection.plugin; import java.util.stream.Stream; import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; +import eu.dnetlib.dhp.collection.worker.CollectorPluginReport; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; public interface CollectorPlugin { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index 29e12f312..7ec2f09be 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -15,8 +15,8 @@ import com.google.common.collect.Lists; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; -import 
eu.dnetlib.dhp.collection.worker.utils.HttpClientParams; +import eu.dnetlib.dhp.collection.worker.CollectorPluginReport; +import eu.dnetlib.dhp.collection.worker.HttpClientParams; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; public class OaiCollectorPlugin implements CollectorPlugin { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index 667e7a3d3..8d913b68f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -17,9 +17,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; -import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner; +import eu.dnetlib.dhp.collection.worker.CollectorPluginReport; +import eu.dnetlib.dhp.collection.worker.HttpConnector2; +import eu.dnetlib.dhp.collection.worker.XmlCleaner; public class OaiIterator implements Iterator { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java index c751a94e7..f63fa37a1 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java @@ -3,9 +3,9 @@ package eu.dnetlib.dhp.collection.plugin.oai; import java.util.Iterator; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; -import eu.dnetlib.dhp.collection.worker.utils.HttpClientParams; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; +import eu.dnetlib.dhp.collection.worker.CollectorPluginReport; +import eu.dnetlib.dhp.collection.worker.HttpClientParams; +import eu.dnetlib.dhp.collection.worker.HttpConnector2; public class OaiIteratorFactory { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorPluginFactory.java similarity index 84% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorPluginFactory.java index ab7dad077..9668098f0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorPluginFactory.java @@ -1,9 +1,8 @@ -package eu.dnetlib.dhp.collection.worker.utils; +package eu.dnetlib.dhp.collection.worker; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; -import eu.dnetlib.dhp.collection.worker.CollectorException; public class CollectorPluginFactory { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginReport.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorPluginReport.java similarity index 86% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginReport.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorPluginReport.java index b7bf539dc..2da6ac8f9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginReport.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorPluginReport.java @@ -1,7 +1,7 @@ -package eu.dnetlib.dhp.collection.worker.utils; +package eu.dnetlib.dhp.collection.worker; -import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.MAPPER; +import static eu.dnetlib.dhp.utils.DHPUtils.*; import java.io.Closeable; import java.io.IOException; @@ -45,7 +45,7 @@ public class CollectorPluginReport extends LinkedHashMap<String, String> impleme } public Boolean isSuccess() { - return Boolean.valueOf(get(SUCCESS)); + return containsKey(SUCCESS) && Boolean.valueOf(get(SUCCESS)); } public void setSuccess(Boolean success) { @@ -58,7 +58,7 @@ public class CollectorPluginReport extends LinkedHashMap<String, String> impleme if (Objects.nonNull(fos)) { log.info("writing report {} to {}", data, path.toString()); IOUtils.write(data, fos); - ApplicationUtils.populateOOZIEEnv(this); + populateOOZIEEnv(this); } } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java index b0efd088c..71dee0d03 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java @@ -2,7 +2,6 @@ package eu.dnetlib.dhp.collection.worker; import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.SEQUENCE_FILE_NAME; -import static eu.dnetlib.dhp.application.ApplicationUtils.*; import java.io.IOException; import java.util.concurrent.atomic.AtomicInteger; @@ -17,10 +16,6 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; -import eu.dnetlib.dhp.collection.worker.utils.HttpClientParams; -import eu.dnetlib.dhp.collection.worker.utils.UnknownCollectorPluginException; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.message.MessageSender; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java index 8f26074c3..a6c254d42 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java @@ -2,29 +2,21 @@ package eu.dnetlib.dhp.collection.worker; import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; -import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; -import static eu.dnetlib.dhp.application.ApplicationUtils.*; +import static
eu.dnetlib.dhp.utils.DHPUtils.*; import java.io.IOException; import java.util.Optional; import org.apache.commons.cli.ParseException; -import org.apache.commons.io.FileSystemUtils; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; -import eu.dnetlib.dhp.aggregation.common.AggregationUtility; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; -import eu.dnetlib.dhp.collection.worker.utils.HttpClientParams; -import eu.dnetlib.dhp.collection.worker.utils.UnknownCollectorPluginException; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.message.MessageSender; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java index e0e402cfb..3a8145946 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java @@ -2,10 +2,7 @@ package eu.dnetlib.dhp.collection.worker; import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.REPORT_FILE_NAME; -import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; -import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.MAPPER; -import static eu.dnetlib.dhp.application.ApplicationUtils.getHadoopConfiguration; -import static eu.dnetlib.dhp.application.ApplicationUtils.populateOOZIEEnv; +import static eu.dnetlib.dhp.utils.DHPUtils.*; import java.io.IOException; import java.util.Objects; @@ -13,15 +10,11 @@ import java.util.Objects; import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.yarn.webapp.hamlet.Hamlet; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; -import eu.dnetlib.dhp.aggregation.common.AggregationUtility; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; -import eu.dnetlib.dhp.collection.worker.utils.UnknownCollectorPluginException; /** * CollectorWorkerReporter diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpClientParams.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpClientParams.java similarity index 96% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpClientParams.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpClientParams.java index e77f3680f..315dd27c2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpClientParams.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpClientParams.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection.worker.utils; +package eu.dnetlib.dhp.collection.worker; /** * Bundles the http connection 
parameters driving the client behaviour. diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector2.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpConnector2.java similarity index 93% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector2.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpConnector2.java index 68b1ef8ad..ee3acf432 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector2.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpConnector2.java @@ -1,5 +1,7 @@ -package eu.dnetlib.dhp.collection.worker.utils; +package eu.dnetlib.dhp.collection.worker; + +import static eu.dnetlib.dhp.utils.DHPUtils.*; import java.io.IOException; import java.io.InputStream; @@ -13,8 +15,6 @@ import org.apache.http.HttpHeaders; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import eu.dnetlib.dhp.collection.worker.CollectorException; - /** * Migrated from https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java * @@ -162,11 +162,19 @@ public class HttpConnector2 { } } throw new CollectorException( - String.format("Unexpected status code: %s error %s", urlConn.getResponseCode(), report)); + String + .format( + "Unexpected status code: %s errors: %s", urlConn.getResponseCode(), + MAPPER.writeValueAsString(report))); } catch (MalformedURLException | SocketException | UnknownHostException e) { log.error(e.getMessage(), e); report.put(e.getClass().getName(), e.getMessage()); throw new CollectorException(e.getMessage(), e); + } catch (SocketTimeoutException e) { + log.error(e.getMessage(), e); + report.put(e.getClass().getName(), e.getMessage()); + backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000); + return attemptDownload(requestUrl, retryNumber + 1, report); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/UnknownCollectorPluginException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/UnknownCollectorPluginException.java similarity index 93% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/UnknownCollectorPluginException.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/UnknownCollectorPluginException.java index c55d485e2..7134dd069 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/UnknownCollectorPluginException.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/UnknownCollectorPluginException.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection.worker.utils; +package eu.dnetlib.dhp.collection.worker; public class UnknownCollectorPluginException extends Exception { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/XmlCleaner.java similarity index 99% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/XmlCleaner.java index 
44aeb4d02..41ba02196 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/XmlCleaner.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection.worker.utils; +package eu.dnetlib.dhp.collection.worker; import java.util.HashMap; import java.util.HashSet; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index b735ecb1f..c0a03e081 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -2,8 +2,8 @@ package eu.dnetlib.dhp.transformation; import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; -import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.utils.DHPUtils.*; import java.io.IOException; import java.util.Map; diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java index e6fda9be0..7f597f950 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java @@ -14,7 +14,7 @@ import org.junit.jupiter.api.Test; import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser; import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector2; +import eu.dnetlib.dhp.collection.worker.HttpConnector2; @Disabled public class EXCELParserTest { diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java index 65c2833eb..80bafd6d8 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java @@ -3,17 +3,13 @@ package eu.dnetlib.dhp.collector.worker; import static org.junit.jupiter.api.Assertions.assertNotNull; -import java.nio.file.Path; - import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.collection.worker.CollectorWorker; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.dhp.collection.worker.utils.HttpClientParams; +import eu.dnetlib.dhp.collection.worker.CollectorPluginFactory; +import eu.dnetlib.dhp.collection.worker.HttpClientParams; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; @Disabled diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java index 69376d5eb..d665e5b5f 100644 --- 
a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java @@ -1,13 +1,14 @@ package eu.dnetlib.dhp.collector.worker.utils; +import static eu.dnetlib.dhp.utils.DHPUtils.*; + import java.io.IOException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; -import eu.dnetlib.dhp.aggregation.common.AggregationUtility; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginReport; +import eu.dnetlib.dhp.collection.worker.CollectorPluginReport; public class CollectorPluginReportTest { @@ -17,11 +18,11 @@ public class CollectorPluginReportTest { r1.put("a", "b"); r1.setSuccess(true); - String s = AggregationUtility.MAPPER.writeValueAsString(r1); + String s = MAPPER.writeValueAsString(r1); Assertions.assertNotNull(s); - CollectorPluginReport r2 = AggregationUtility.MAPPER.readValue(s, CollectorPluginReport.class); + CollectorPluginReport r2 = MAPPER.readValue(s, CollectorPluginReport.class); Assertions.assertTrue(r2.isSuccess(), "should be true"); } From 50add4c61b991926fdf379502db3b3a0846769a9 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 8 Feb 2021 12:19:38 +0100 Subject: [PATCH 40/86] added requestDelay to HttpConnector2 configuration; Aggregation workflow constants moved in dhp-common --- .../model => collection}/ApiDescriptor.java | 2 +- .../java/eu/dnetlib/dhp/common/Constants.java | 20 +++++++++++ .../common/AggregationConstants.java | 15 -------- .../mdstore/MDStoreActionNode.java | 3 +- .../GenerateNativeStoreSparkJob.java | 2 +- .../collection/plugin/CollectorPlugin.java | 2 +- .../plugin/oai/OaiCollectorPlugin.java | 2 +- .../collection/worker/CollectorWorker.java | 4 +-- .../worker/CollectorWorkerApplication.java | 22 ++++++++---- .../worker/CollectorWorkerReporter.java | 2 +- .../collection/worker/HttpClientParams.java | 36 +++++++++++++++++-- .../dhp/collection/worker/HttpConnector2.java | 20 +++++++---- .../transformation/TransformSparkJobNode.java | 2 +- .../collector_worker_input_parameter.json | 14 +++++--- .../dhp/collection/oozie_app/workflow.xml | 1 + .../dhp/aggregation/AggregationJobTest.java | 2 +- .../CollectorWorkerApplicationTests.java | 2 +- .../transformation/TransformationJobTest.java | 4 +-- 18 files changed, 106 insertions(+), 49 deletions(-) rename dhp-common/src/main/java/eu/dnetlib/dhp/{collector/worker/model => collection}/ApiDescriptor.java (93%) delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/collector/worker/model/ApiDescriptor.java b/dhp-common/src/main/java/eu/dnetlib/dhp/collection/ApiDescriptor.java similarity index 93% rename from dhp-common/src/main/java/eu/dnetlib/dhp/collector/worker/model/ApiDescriptor.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/collection/ApiDescriptor.java index 8ba30faeb..12937a197 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/collector/worker/model/ApiDescriptor.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/collection/ApiDescriptor.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collector.worker.model; +package eu.dnetlib.dhp.collection; import java.util.HashMap; import java.util.Map; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java index 2b8ef4e30..eb4cb91ed 100644 --- 
a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java @@ -27,4 +27,24 @@ public class Constants { coarCodeLabelMap.put("c_f1cf", "EMBARGO"); } + public static final String SEQUENCE_FILE_NAME = "/sequence_file"; + public static final String REPORT_FILE_NAME = "/report"; + public static final String MDSTORE_DATA_PATH = "/store"; + public static final String MDSTORE_SIZE_PATH = "/size"; + + public static final String COLLECTION_MODE = "collectionMode"; + public static final String METADATA_ENCODING = "metadataEncoding"; + public static final String OOZIE_WF_PATH = "oozieWfPath"; + public static final String DNET_MESSAGE_MGR_URL = "dnetMessageManagerURL"; + + public static final String MAX_NUMBER_OF_RETRY = "maxNumberOfRetry"; + public static final String REQUEST_DELAY = "requestDelay"; + public static final String RETRY_DELAY = "retryDelay"; + public static final String CONNECT_TIMEOUT = "connectTimeOut"; + public static final String READ_TIMEOUT = "readTimeOut"; + + public static final String CONTENT_TOTALITEMS = "TotalItems"; + public static final String CONTENT_INVALIDRECORDS = "InvalidRecords"; + public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems"; + } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java deleted file mode 100644 index 8e0b7260d..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregationConstants.java +++ /dev/null @@ -1,15 +0,0 @@ - -package eu.dnetlib.dhp.aggregation.common; - -public class AggregationConstants { - - public static final String SEQUENCE_FILE_NAME = "/sequence_file"; - public static final String REPORT_FILE_NAME = "/report"; - public static final String MDSTORE_DATA_PATH = "/store"; - public static final String MDSTORE_SIZE_PATH = "/size"; - - public static final String CONTENT_TOTALITEMS = "TotalItems"; - public static final String CONTENT_INVALIDRECORDS = "InvalidRecords"; - public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems"; - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java index 9a47a1d66..829921dd8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java @@ -1,8 +1,7 @@ package eu.dnetlib.dhp.aggregation.mdstore; -import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; -import static eu.dnetlib.dhp.application.ApplicationUtils.*; +import static eu.dnetlib.dhp.common.Constants.*; import static eu.dnetlib.dhp.utils.DHPUtils.*; import java.net.URI; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index 5c24bb7ec..ee82cc94f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -1,7 +1,7 @@ package eu.dnetlib.dhp.collection; -import 
static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; +import static eu.dnetlib.dhp.common.Constants.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.utils.DHPUtils.*; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index 614aa4e69..e2be481ed 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -3,9 +3,9 @@ package eu.dnetlib.dhp.collection.plugin; import java.util.stream.Stream; +import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.worker.CollectorException; import eu.dnetlib.dhp.collection.worker.CollectorPluginReport; -import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; public interface CollectorPlugin { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index 7ec2f09be..84228abf4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -13,11 +13,11 @@ import com.google.common.base.Splitter; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; +import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.worker.CollectorException; import eu.dnetlib.dhp.collection.worker.CollectorPluginReport; import eu.dnetlib.dhp.collection.worker.HttpClientParams; -import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; public class OaiCollectorPlugin implements CollectorPlugin { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java index 71dee0d03..c2d32019d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java @@ -1,7 +1,7 @@ package eu.dnetlib.dhp.collection.worker; -import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.SEQUENCE_FILE_NAME; +import static eu.dnetlib.dhp.common.Constants.SEQUENCE_FILE_NAME; import java.io.IOException; import java.util.concurrent.atomic.AtomicInteger; @@ -15,8 +15,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.message.MessageSender; public class CollectorWorker { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java index a6c254d42..6e4237bee 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java @@ -1,7 +1,7 @@ package eu.dnetlib.dhp.collection.worker; -import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; +import static eu.dnetlib.dhp.common.Constants.*; import static eu.dnetlib.dhp.utils.DHPUtils.*; import java.io.IOException; @@ -17,7 +17,7 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; +import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.message.MessageSender; /** @@ -55,7 +55,7 @@ public class CollectorWorkerApplication { final String mdStoreVersion = argumentParser.get("mdStoreVersion"); log.info("mdStoreVersion is {}", mdStoreVersion); - final String dnetMessageManagerURL = argumentParser.get("dnetMessageManagerURL"); + final String dnetMessageManagerURL = argumentParser.get(DNET_MESSAGE_MGR_URL); log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL); final String workflowId = argumentParser.get("workflowId"); @@ -87,15 +87,23 @@ public class CollectorWorkerApplication { clientParams .setMaxNumberOfRetry( Optional - .ofNullable(argumentParser.get("maxNumberOfRetry")) + .ofNullable(argumentParser.get(MAX_NUMBER_OF_RETRY)) .map(Integer::parseInt) .orElse(HttpClientParams._maxNumberOfRetry)); log.info("maxNumberOfRetry is {}", clientParams.getMaxNumberOfRetry()); + clientParams + .setRequestDelay( + Optional + .ofNullable(argumentParser.get(REQUEST_DELAY)) + .map(Integer::parseInt) + .orElse(HttpClientParams._requestDelay)); + log.info("requestDelay is {}", clientParams.getRequestDelay()); + clientParams .setRetryDelay( Optional - .ofNullable(argumentParser.get("retryDelay")) + .ofNullable(argumentParser.get(RETRY_DELAY)) .map(Integer::parseInt) .orElse(HttpClientParams._retryDelay)); log.info("retryDelay is {}", clientParams.getRetryDelay()); @@ -103,7 +111,7 @@ public class CollectorWorkerApplication { clientParams .setConnectTimeOut( Optional - .ofNullable(argumentParser.get("connectTimeOut")) + .ofNullable(argumentParser.get(CONNECT_TIMEOUT)) .map(Integer::parseInt) .orElse(HttpClientParams._connectTimeOut)); log.info("connectTimeOut is {}", clientParams.getConnectTimeOut()); @@ -111,7 +119,7 @@ public class CollectorWorkerApplication { clientParams .setReadTimeOut( Optional - .ofNullable(argumentParser.get("readTimeOut")) + .ofNullable(argumentParser.get(READ_TIMEOUT)) .map(Integer::parseInt) .orElse(HttpClientParams._readTimeOut)); log.info("readTimeOut is {}", clientParams.getReadTimeOut()); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java index 3a8145946..3f6fc4784 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java @@ -1,7 +1,7 @@ package eu.dnetlib.dhp.collection.worker; -import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.REPORT_FILE_NAME; +import static eu.dnetlib.dhp.common.Constants.REPORT_FILE_NAME; import static eu.dnetlib.dhp.utils.DHPUtils.*; import 
java.io.IOException; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpClientParams.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpClientParams.java index 315dd27c2..f45790460 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpClientParams.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpClientParams.java @@ -6,22 +6,46 @@ package eu.dnetlib.dhp.collection.worker; */ public class HttpClientParams { + // Defaults public static int _maxNumberOfRetry = 3; + public static int _requestDelay = 0; // milliseconds public static int _retryDelay = 10; // seconds public static int _connectTimeOut = 10; // seconds public static int _readTimeOut = 30; // seconds + /** + * Maximum number of allowed retries before failing + */ private int maxNumberOfRetry; + + /** + * Delay between requests (milliseconds) + */ + private int requestDelay; + + /** + * Time to wait after a failure before retrying (seconds) + */ private int retryDelay; + + /** + * Connect timeout (seconds) + */ private int connectTimeOut; + + /** + * Read timeout (seconds) + */ private int readTimeOut; public HttpClientParams() { - this(_maxNumberOfRetry, _retryDelay, _connectTimeOut, _readTimeOut); + this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut); } - public HttpClientParams(int maxNumberOfRetry, int retryDelay, int connectTimeOut, int readTimeOut) { + public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut, + int readTimeOut) { this.maxNumberOfRetry = maxNumberOfRetry; + this.requestDelay = requestDelay; this.retryDelay = retryDelay; this.connectTimeOut = connectTimeOut; this.readTimeOut = readTimeOut; @@ -35,6 +59,14 @@ public class HttpClientParams { this.maxNumberOfRetry = maxNumberOfRetry; } + public int getRequestDelay() { + return requestDelay; + } + + public void setRequestDelay(int requestDelay) { + this.requestDelay = requestDelay; + } + public int getRetryDelay() { return retryDelay; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpConnector2.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpConnector2.java index ee3acf432..368c89509 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpConnector2.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpConnector2.java @@ -18,7 +18,7 @@ import org.slf4j.LoggerFactory; /** * Migrated from https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java * - * @author jochen, michele, andrea, alessia + * @author jochen, michele, andrea, alessia, claudio */ public class HttpConnector2 { @@ -83,14 +83,22 @@ public class HttpConnector2 { final CollectorPluginReport report) throws CollectorException, IOException { if (retryNumber > getClientParams().getMaxNumberOfRetry()) { - throw new CollectorException("Max number of retries exceeded.
Cause: \n " + report); + final String msg = String + .format( + "Max number of retries (%s/%s) exceeded, failing.", + retryNumber, getClientParams().getMaxNumberOfRetry()); + log.error(msg); + throw new CollectorException(msg); } - log.info("Downloading attempt {} [{}]", retryNumber, requestUrl); + log.info("Request attempt {} [{}]", retryNumber, requestUrl); InputStream input = null; try { + if (getClientParams().getRequestDelay() > 0) { + backoffAndSleep(getClientParams().getRequestDelay()); + } final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); urlConn.setInstanceFollowRedirects(false); urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000); @@ -190,10 +198,10 @@ public class HttpConnector2 { } } - private void backoffAndSleep(int sleepTime) throws CollectorException { - log.info("I'm going to sleep for {}ms", sleepTime); + private void backoffAndSleep(int sleepTimeMs) throws CollectorException { + log.info("I'm going to sleep for {}ms", sleepTimeMs); try { - Thread.sleep(sleepTime); + Thread.sleep(sleepTimeMs); } catch (InterruptedException e) { log.error(e.getMessage(), e); throw new CollectorException(e); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index c0a03e081..e628e7645 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -1,7 +1,7 @@ package eu.dnetlib.dhp.transformation; -import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.*; +import static eu.dnetlib.dhp.common.Constants.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.utils.DHPUtils.*; diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json index f3eaf2d71..cd4b8224b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json @@ -30,25 +30,31 @@ "paramRequired": true }, { - "paramName": "mr", + "paramName": "mnr", "paramLongName": "maxNumberOfRetry", "paramDescription": "the maximum number of admitted connection retries", "paramRequired": false }, { - "paramName": "rd", + "paramName": "rqd", + "paramLongName": "requestDelay", + "paramDescription": "the delay (ms) between requests", + "paramRequired": false + }, + { + "paramName": "rtd", "paramLongName": "retryDelay", "paramDescription": "the delay (ms) between retries", "paramRequired": false }, { - "paramName": "ct", + "paramName": "cto", "paramLongName": "connectTimeOut", "paramDescription": "the maximum allowed time (ms) to connect to the remote host", "paramRequired": false }, { - "paramName": "rt", + "paramName": "rto", "paramLongName": "readTimeOut", "paramDescription": "the maximum allowed time (ms) to receive content from the remote host", "paramRequired": false diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml 
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index e7f6b9201..fe8eea370 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -95,6 +95,7 @@ --dnetMessageManagerURL${dnetMessageManagerURL} --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} --maxNumberOfRetry${maxNumberOfRetry} + --requestDelay${requestDelay} --retryDelay${retryDelay} --connectTimeOut${connectTimeOut} --readTimeOut${readTimeOut} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java index 3cb66d5ee..ff3ff3b6e 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java @@ -1,7 +1,7 @@ package eu.dnetlib.dhp.aggregation; -import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.MDSTORE_DATA_PATH; +import static eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH; import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.File; diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java index 80bafd6d8..975ef944e 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java @@ -8,9 +8,9 @@ import org.junit.jupiter.api.Test; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.worker.CollectorPluginFactory; import eu.dnetlib.dhp.collection.worker.HttpClientParams; -import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor; @Disabled public class CollectorWorkerApplicationTests { diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 9d6dacf0c..997727e33 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -1,14 +1,12 @@ package eu.dnetlib.dhp.transformation; -import static eu.dnetlib.dhp.aggregation.common.AggregationConstants.MDSTORE_DATA_PATH; +import static eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.mockito.Mockito.lenient; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Collections; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; From bebc54d5bff5fe4f03e0b596be6377835cc72cab Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 8 Feb 2021 18:06:25 +0100 Subject: [PATCH 41/86] seq file storing native records is now compressed --- .../eu/dnetlib/dhp/collection/worker/CollectorWorker.java | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java index c2d32019d..945eff8b0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java @@ -11,6 +11,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -63,7 +64,8 @@ public class CollectorWorker { conf, SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { + SequenceFile.Writer.valueClass(Text.class), + SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) { final IntWritable key = new IntWritable(counter.get()); final Text value = new Text(); plugin From bae029f8288e20c5888e1ef39489d88f2a08ce96 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 8 Feb 2021 18:07:23 +0100 Subject: [PATCH 42/86] collection_java_xmx allows to declare the heap size allocated for the java actions involved in the metadata collectionw workflow --- .../aggregation/mdstore/MDStoreActionNode.java | 2 ++ .../worker/CollectorWorkerApplication.java | 4 +++- .../dhp/collection/oozie_app/workflow.xml | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java index 829921dd8..09f3ffd63 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/mdstore/MDStoreActionNode.java @@ -45,6 +45,8 @@ public class MDStoreActionNode { "/eu/dnetlib/dhp/collection/mdstore_action_parameters.json"))); argumentParser.parseArgument(args); + log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024)); + final MDAction action = MDAction.valueOf(argumentParser.get("action")); log.info("Current action is {}", action); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java index 6e4237bee..17f09ee5a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java @@ -41,11 +41,13 @@ public class CollectorWorkerApplication { final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser( IOUtils .toString( - CollectorWorker.class + CollectorWorkerApplication.class .getResourceAsStream( "/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json"))); argumentParser.parseArgument(args); + log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024)); + final String hdfsuri = argumentParser.get("namenode"); log.info("hdfsURI is {}", hdfsuri); diff --git 
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index fe8eea370..5497b2c50 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -41,6 +41,14 @@ collectionMode Should be REFRESH or INCREMENTAL + + + collection_java_xmx + -Xmx200m + Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb. + + + @@ -65,6 +73,7 @@ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} --actionREAD_LOCK --mdStoreID${mdStoreID} --mdStoreManagerURI${mdStoreManagerURI} @@ -77,6 +86,7 @@ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} --actionNEW_VERSION --mdStoreID${mdStoreID} --mdStoreManagerURI${mdStoreManagerURI} @@ -89,6 +99,7 @@ eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication + ${collection_java_xmx} --apidescriptor${apiDescription} --namenode${nameNode} --workflowId${workflowId} @@ -108,6 +119,7 @@ eu.dnetlib.dhp.collection.worker.CollectorWorkerReporter + ${collection_java_xmx} --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} --namenode${nameNode} @@ -153,6 +165,7 @@ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} --actionREAD_UNLOCK --mdStoreManagerURI${mdStoreManagerURI} --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} @@ -164,6 +177,7 @@ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} --actionCOMMIT --namenode${nameNode} --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} @@ -184,6 +198,7 @@ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} --actionREAD_UNLOCK --mdStoreManagerURI${mdStoreManagerURI} --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} @@ -195,6 +210,7 @@ eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} --actionROLLBACK --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} --mdStoreManagerURI${mdStoreManagerURI} From ebcc3ec14f597651dea04469bf645621e5aa148a Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 11 Feb 2021 16:25:51 +0100 Subject: [PATCH 43/86] updated wrong datacite identifier in trasformation --- .../actionmanager/datacite/DataciteToOAFTransformation.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala index 933f1445f..dc5b8b093 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala @@ -66,7 +66,7 @@ object DataciteToOAFTransformation { val unknown_repository: HostedByMapType = HostedByMapType("openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18", "Unknown Repository", "Unknown Repository", Some(1.0F)) val dataInfo: DataInfo = generateDataInfo("0.9") - val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue("openaire____::datacite", "Datacite") + val DATACITE_COLLECTED_FROM: KeyValue = 
OafMapperUtils.keyValue("openaire____::9e3be59865b2c1c335d32dae2fe7b254", "Datacite") val hostedByMap: Map[String, HostedByMapType] = { val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString From 17e6f1934edb0eab39a74173a443c58184272776 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 12 Feb 2021 11:48:11 +0100 Subject: [PATCH 44/86] fixed NPE on cleaner --- .../datacite/AbstractRestClient.scala | 14 +- .../dhp/transformation/xslt/Cleaner.java | 6 +- .../transformation/TransformationJobTest.java | 12 +- .../eu/dnetlib/dhp/transform/ext_simple.xsl | 1 + .../eu/dnetlib/dhp/transform/input.xml | 2 +- .../eu/dnetlib/dhp/transform/input_zenodo.xml | 99 ++++ .../eu/dnetlib/dhp/transform/zenodo_tr.xslt | 444 ++++++++++++++++++ 7 files changed, 571 insertions(+), 7 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_zenodo.xml create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala index 852147ccd..3c7770075 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala @@ -53,11 +53,21 @@ abstract class AbstractRestClient extends Iterator[String]{ } + + private def doHTTPRequest[A <: HttpUriRequest](r: A) :String ={ val client = HttpClients.createDefault try { - val response = client.execute(r) - IOUtils.toString(response.getEntity.getContent) + var tries = 4 + while (tries > 0) { + val response = client.execute(r) + if (response.getStatusLine.getStatusCode > 400) { + tries -= 1 + } + else + return IOUtils.toString(response.getEntity.getContent) + } + "" } catch { case e: Throwable => throw new RuntimeException("Error on executing request ", e) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java index 7b0fdd484..8b7024cbe 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java @@ -27,13 +27,17 @@ public class Cleaner implements ExtensionFunction, Serializable { @Override public SequenceType[] getArgumentTypes() { return new SequenceType[] { - SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE), + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_MORE), SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE) }; } @Override public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { + XdmValue r = xdmValues[0]; + if (r.size() == 0){ + return new XdmAtomicValue(""); + } final String currentValue = xdmValues[0].itemAt(0).getStringValue(); final String vocabularyName = xdmValues[1].itemAt(0).getStringValue(); Qualifier cleanedValue = vocabularies.getSynonymAsQualifier(vocabularyName, currentValue); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java 
b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 997727e33..69b31b30f 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -61,13 +61,19 @@ public class TransformationJobTest extends AbstractVocabularyTest { // We Set the input Record getting the XML from the classpath final MetadataRecord mr = new MetadataRecord(); - mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_zenodo.xml"))); // We Load the XSLT transformation Rule from the classpath - XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/ext_simple.xsl"); + XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt"); + + + MetadataRecord result = tr.call(mr); + + + // Print the record - System.out.println(tr.call(mr).getBody()); + System.out.println(result.getBody()); // TODO Create significant Assert } diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl index becd3a05e..c114217c2 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl @@ -6,6 +6,7 @@ version="2.0" exclude-result-prefixes="xsl vocabulary"> + diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml index ebe8e919b..3d136d56d 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input.xml @@ -6,7 +6,7 @@ PSNCRepository:PSNCExternalRepository:Departments PSNCRepository:PSNCExternalRepository:Departments:NetworkServices PSNCRepository:PSNCExternalRepository - PSNCRepository:PSNCExternalRepository:publications + aRTIcle - Letter to the editor PSNCRepository diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_zenodo.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_zenodo.xml new file mode 100644 index 000000000..043eae343 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_zenodo.xml @@ -0,0 +1,99 @@ + + + + r37b0ad08687::000374d100a9db469bd42b69dbb40b36 + 10.5281/zenodo.3234526 + 2020-03-23T03:03:50.72Z + r37b0ad08687 + oai:zenodo.org:3234526 + 2020-03-19T10:58:08Z + openaire_data + user-epfl + + + + true + 3.1 + CERN.ZENODO + + + 10.5281/zenodo.3234526 + + + Nouchi, Vincent + Physics of Aquatic Systems Laboratory (APHYS) – Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland + + + Lavanchy, Sébastien + Physics of Aquatic Systems Laboratory (APHYS) – Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland + + + Baracchini, Theo + Physics of Aquatic Systems Laboratory (APHYS) – Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland + + + Wüest, Alfred + Physics of Aquatic Systems Laboratory (APHYS) – Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, 
Switzerland + + + Bouffard, Damien + Eawag, Swiss Federal Institute of Aquatic Science and Technology, Surface Waters – Research and Management, Kastanienbaum, 6047, Switzerland + + + + Temperature and ADCP data collected on Lake Geneva between 2015 and 2017 + + Zenodo + 2019 + + Lake Geneva + temperature + ADCP + + + 2019-05-29 + + + + 10.5281/zenodo.3234525 + https://zenodo.org/communities/epfl + + 1.0.0 + + Creative Commons Attribution 4.0 International + Open Access + + +

+ Data collected between 2015 and 2017 on Lake Geneva by Acoustic Doppler Current Profiler (ADCP) and CTDs. One file includes all the temperature profiles, the two others are the ADCP data (up- and down-looking) at the SHL2 station (centre of the main basin). Coordinates of the SHL2 station are 534700 and 144950 in the Swiss CH1903 coordinate system. The file with the CTD data contains the coordinates of the sample location (lat, lon), times (in MATLAB time), depths (in meters) and temperatures (in &deg;C).
+ All files are in MATLAB .mat format.
+ https%3A%2F%2Fzenodo.org%2Foai2d oai:zenodo.org:3234526 2020-03-19T10:58:08Z
+ false false 0.9
\ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt new file mode 100644 index 000000000..e67ed9dda --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt @@ -0,0 +1,444 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + OPEN + + + + + CLOSED + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file From 29c6f7e255cd6b23d254833789827f8b6c869a73 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 12 Feb 2021 12:31:02 +0100 Subject: [PATCH 45/86] classes related to the collection workflow moved into common package; implemented MongoDB collection plugins --- .../eu/dnetlib/dhp/message/MessageSender.java | 8 +- dhp-workflows/dhp-aggregation/pom.xml | 5 +- .../actionmanager/project/utils/ReadCSV.java | 2 +- .../project/utils/ReadExcel.java | 2 +- .../{worker => }/CollectorException.java | 2 +- .../{worker => }/CollectorPluginReport.java | 9 +- .../{worker => }/CollectorWorker.java | 46 +++++-- .../CollectorWorkerApplication.java | 32 +++-- .../{worker => }/CollectorWorkerReporter.java | 2 +- .../{worker => }/HttpClientParams.java | 2 +- .../{worker => }/HttpConnector2.java | 2 +- .../UnknownCollectorPluginException.java | 2 +- .../collection/{worker => }/XmlCleaner.java | 2 +- .../collection/plugin/CollectorPlugin.java | 4 +- .../mongodb/MongoDbCollectorPlugin.java | 59 ++++++++ .../mongodb/MongoDbDumpCollectorPlugin.java | 54 ++++++++ .../plugin/oai/OaiCollectorPlugin.java | 6 +- .../collection/plugin/oai/OaiIterator.java | 8 +- .../plugin/oai/OaiIteratorFactory.java | 6 +- .../worker/CollectorPluginFactory.java | 20 --- .../dhp/collection/oozie_app/workflow.xml | 4 +- .../project/EXCELParserTest.java | 4 +- .../dhp/collection/CollectionJobTest.java | 130 ------------------ .../collection/CollectionWorkflowTest.java | 113 +++++++++++++++ .../GenerateNativeStoreSparkJobTest.java} | 84 +++++++++-- .../CollectorWorkerApplicationTests.java | 10 -- .../utils/CollectorPluginReportTest.java | 2 +- .../transformation/TransformationJobTest.java | 4 +- .../dnetlib/dhp/collection/apiDescriptor.json | 10 ++ .../eu/dnetlib/dhp/oa/provision/fields.xml | 2 + .../eu/dnetlib/dhp/oa/provision/record.xml | 4 +- 31 files changed, 411 insertions(+), 229 deletions(-) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/{worker => }/CollectorException.java (93%) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/{worker => }/CollectorPluginReport.java (90%) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/{worker => }/CollectorWorker.java (64%) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/{worker => }/CollectorWorkerApplication.java (86%) rename 
dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/{worker => }/CollectorWorkerReporter.java (97%) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/{worker => }/HttpClientParams.java (97%) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/{worker => }/HttpConnector2.java (99%) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/{worker => }/UnknownCollectorPluginException.java (94%) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/{worker => }/XmlCleaner.java (99%) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbCollectorPlugin.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbDumpCollectorPlugin.java delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorPluginFactory.java delete mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionWorkflowTest.java rename dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/{aggregation/AggregationJobTest.java => collection/GenerateNativeStoreSparkJobTest.java} (73%) create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/apiDescriptor.json diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java index 3f9d07a7e..16bb0c97e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java @@ -1,6 +1,9 @@ package eu.dnetlib.dhp.message; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPut; @@ -30,13 +33,15 @@ public class MessageSender { private final String workflowId; + private ExecutorService executorService = Executors.newCachedThreadPool(); + public MessageSender(final String dnetMessageEndpoint, final String workflowId) { this.workflowId = workflowId; this.dnetMessageEndpoint = dnetMessageEndpoint; } public void sendMessage(final Message message) { - new Thread(() -> _sendMessage(message)).start(); + executorService.submit(() -> _sendMessage(message)); } public void sendMessage(final Long current, final Long total) { @@ -67,7 +72,6 @@ public class MessageSender { .setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS) .setSocketTimeout(SOCKET_TIMEOUT_MS) .build(); - ; try (final CloseableHttpClient client = HttpClients .custom() diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index f0ee42542..6887be55e 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -106,7 +106,10 @@ commons-compress
- + + org.mongodb + mongo-java-driver +
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java index 3f64eb953..cad6b94e1 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java @@ -18,7 +18,7 @@ import org.apache.hadoop.fs.Path; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.HttpConnector2; +import eu.dnetlib.dhp.collection.HttpConnector2; /** * Applies the parsing of a csv file and writes the Serialization of it in hdfs diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java index c661909b0..fc3b38ac5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java @@ -15,7 +15,7 @@ import org.apache.hadoop.fs.Path; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.HttpConnector2; +import eu.dnetlib.dhp.collection.HttpConnector2; /** * Applies the parsing of an excel file and writes the Serialization of it in hdfs diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorException.java similarity index 93% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorException.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorException.java index 71d225f13..144d297e6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorException.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorException.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection.worker; +package eu.dnetlib.dhp.collection; public class CollectorException extends Exception { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorPluginReport.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java similarity index 90% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorPluginReport.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java index 2da6ac8f9..a7204523a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorPluginReport.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection.worker; +package eu.dnetlib.dhp.collection; import static eu.dnetlib.dhp.utils.DHPUtils.*; @@ -17,15 +17,10 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.annotation.JsonIgnore; -import eu.dnetlib.dhp.application.ApplicationUtils; - public class CollectorPluginReport extends LinkedHashMap implements 
Closeable { private static final Logger log = LoggerFactory.getLogger(CollectorPluginReport.class); - @JsonIgnore - private FileSystem fs; - @JsonIgnore private Path path; @@ -38,9 +33,7 @@ public class CollectorPluginReport extends LinkedHashMap impleme } public CollectorPluginReport(FileSystem fs, Path path) throws IOException { - this.fs = fs; this.path = path; - this.fos = fs.create(path); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java similarity index 64% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java index 945eff8b0..ace725bfd 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java @@ -1,23 +1,27 @@ -package eu.dnetlib.dhp.collection.worker; +package eu.dnetlib.dhp.collection; import static eu.dnetlib.dhp.common.Constants.SEQUENCE_FILE_NAME; import java.io.IOException; +import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; -import org.apache.hadoop.conf.Configuration; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.io.compress.DeflateCodec; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; -import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbCollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; import eu.dnetlib.dhp.message.MessageSender; public class CollectorWorker { @@ -26,7 +30,7 @@ public class CollectorWorker { private final ApiDescriptor api; - private final Configuration conf; + private final FileSystem fileSystem; private final MDStoreVersion mdStoreVersion; @@ -38,13 +42,13 @@ public class CollectorWorker { public CollectorWorker( final ApiDescriptor api, - final Configuration conf, + final FileSystem fileSystem, final MDStoreVersion mdStoreVersion, final HttpClientParams clientParams, final MessageSender messageSender, final CollectorPluginReport report) { this.api = api; - this.conf = conf; + this.fileSystem = fileSystem; this.mdStoreVersion = mdStoreVersion; this.clientParams = clientParams; this.messageSender = messageSender; @@ -56,16 +60,16 @@ public class CollectorWorker { final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME; log.info("outputPath path is {}", outputPath); - final CollectorPlugin plugin = CollectorPluginFactory.getPluginByProtocol(clientParams, api.getProtocol()); + final CollectorPlugin plugin = getCollectorPlugin(); final AtomicInteger counter = new AtomicInteger(0); try (SequenceFile.Writer writer = SequenceFile .createWriter( - conf, + fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer.keyClass(IntWritable.class), 
SequenceFile.Writer.valueClass(Text.class), - SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) { + SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) { final IntWritable key = new IntWritable(counter.get()); final Text value = new Text(); plugin @@ -94,4 +98,26 @@ public class CollectorWorker { } } + private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException { + switch (StringUtils.lowerCase(StringUtils.trim(api.getProtocol()))) { + case "oai": + return new OaiCollectorPlugin(clientParams); + case "other": + final String plugin = Optional + .ofNullable(api.getParams().get("other_plugin_type")) + .orElseThrow(() -> new UnknownCollectorPluginException("other_plugin_type")); + + switch (plugin) { + case "mdstore_mongodb_dump": + return new MongoDbDumpCollectorPlugin(fileSystem); + case "mdstore_mongodb": + return new MongoDbCollectorPlugin(); + default: + throw new UnknownCollectorPluginException("Unknown plugin type: " + plugin); + } + default: + throw new UnknownCollectorPluginException("Unknown protocol: " + api.getProtocol()); + } + } + } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java similarity index 86% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java index 17f09ee5a..0eea0837c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection.worker; +package eu.dnetlib.dhp.collection; import static eu.dnetlib.dhp.common.Constants.*; import static eu.dnetlib.dhp.utils.DHPUtils.*; @@ -9,7 +9,6 @@ import java.util.Optional; import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.slf4j.Logger; @@ -17,7 +16,6 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.message.MessageSender; /** @@ -32,6 +30,12 @@ public class CollectorWorkerApplication { private static final Logger log = LoggerFactory.getLogger(CollectorWorkerApplication.class); + private FileSystem fileSystem; + + public CollectorWorkerApplication(FileSystem fileSystem) { + this.fileSystem = fileSystem; + } + /** * @param args */ @@ -63,6 +67,18 @@ public class CollectorWorkerApplication { final String workflowId = argumentParser.get("workflowId"); log.info("workflowId is {}", workflowId); + final HttpClientParams clientParams = getClientParams(argumentParser); + + final ApiDescriptor api = MAPPER.readValue(apiDescriptor, ApiDescriptor.class); + final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(hdfsuri)); + + new CollectorWorkerApplication(fileSystem) + .run(mdStoreVersion, clientParams, api, dnetMessageManagerURL, workflowId); + } + + protected void run(String mdStoreVersion, HttpClientParams clientParams, 
ApiDescriptor api, + String dnetMessageManagerURL, String workflowId) throws IOException { + final MessageSender ms = new MessageSender(dnetMessageManagerURL, workflowId); final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class); @@ -70,13 +86,9 @@ public class CollectorWorkerApplication { final String reportPath = currentVersion.getHdfsPath() + REPORT_FILE_NAME; log.info("report path is {}", reportPath); - final HttpClientParams clientParams = getClientParams(argumentParser); - - final ApiDescriptor api = MAPPER.readValue(apiDescriptor, ApiDescriptor.class); - final Configuration conf = getHadoopConfiguration(hdfsuri); - - try (CollectorPluginReport report = new CollectorPluginReport(FileSystem.get(conf), new Path(reportPath))) { - final CollectorWorker worker = new CollectorWorker(api, conf, currentVersion, clientParams, ms, report); + try (CollectorPluginReport report = new CollectorPluginReport(fileSystem, new Path(reportPath))) { + final CollectorWorker worker = new CollectorWorker(api, fileSystem, currentVersion, clientParams, ms, + report); worker.collect(); report.setSuccess(true); } catch (Throwable e) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerReporter.java similarity index 97% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerReporter.java index 3f6fc4784..d8cf3ec02 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorWorkerReporter.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerReporter.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection.worker; +package eu.dnetlib.dhp.collection; import static eu.dnetlib.dhp.common.Constants.REPORT_FILE_NAME; import static eu.dnetlib.dhp.utils.DHPUtils.*; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpClientParams.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpClientParams.java similarity index 97% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpClientParams.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpClientParams.java index f45790460..ab0d5cc02 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpClientParams.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpClientParams.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection.worker; +package eu.dnetlib.dhp.collection; /** * Bundles the http connection parameters driving the client behaviour. 
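For reference, a minimal sketch of how a caller can assemble the reworked HttpClientParams, including the requestDelay knob introduced earlier in this series. Every setter below appears in the diffs above; the concrete values and the wrapper class name are illustrative only. Note that the field javadocs and the parameter JSON disagree on units (the javadocs say retryDelay and the timeouts are seconds, the JSON descriptions say ms); the comments here follow the javadocs.

import eu.dnetlib.dhp.collection.HttpClientParams;

public class HttpClientParamsSketch {

    public static void main(String[] args) {
        // No-arg constructor applies the _maxNumberOfRetry/_requestDelay/... defaults.
        final HttpClientParams clientParams = new HttpClientParams();

        clientParams.setMaxNumberOfRetry(6);  // give up after 6 failed attempts
        clientParams.setRequestDelay(200);    // pause 200 ms between consecutive requests
        clientParams.setRetryDelay(30);       // wait after a failure before retrying (seconds, per javadoc)
        clientParams.setConnectTimeOut(10);   // connection timeout (seconds, per javadoc)
        clientParams.setReadTimeOut(60);      // read timeout (seconds, per javadoc)

        System.out.println("max retries: " + clientParams.getMaxNumberOfRetry());
        System.out.println("request delay (ms): " + clientParams.getRequestDelay());
    }
}

With requestDelay > 0, HttpConnector2 now sleeps before every request (see the attemptDownload changes above), which throttles harvesting independently of the failure/retry backoff.
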
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpConnector2.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java similarity index 99% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpConnector2.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java index 368c89509..72a2a70a2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/HttpConnector2.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection.worker; +package eu.dnetlib.dhp.collection; import static eu.dnetlib.dhp.utils.DHPUtils.*; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/UnknownCollectorPluginException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/UnknownCollectorPluginException.java similarity index 94% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/UnknownCollectorPluginException.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/UnknownCollectorPluginException.java index 7134dd069..2b0a98e53 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/UnknownCollectorPluginException.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/UnknownCollectorPluginException.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection.worker; +package eu.dnetlib.dhp.collection; public class UnknownCollectorPluginException extends Exception { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/XmlCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/XmlCleaner.java similarity index 99% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/XmlCleaner.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/XmlCleaner.java index 41ba02196..c674031f6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/XmlCleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/XmlCleaner.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection.worker; +package eu.dnetlib.dhp.collection; import java.util.HashMap; import java.util.HashSet; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index e2be481ed..0a4b3a892 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -4,8 +4,8 @@ package eu.dnetlib.dhp.collection.plugin; import java.util.stream.Stream; import eu.dnetlib.dhp.collection.ApiDescriptor; -import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.CollectorPluginReport; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.CollectorPluginReport; public interface CollectorPlugin { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbCollectorPlugin.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbCollectorPlugin.java new file mode 100644 index 000000000..7d1952f9c --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbCollectorPlugin.java @@ -0,0 +1,59 @@ + +package eu.dnetlib.dhp.collection.plugin.mongodb; + +import java.util.Optional; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import org.bson.Document; + +import com.mongodb.MongoClient; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; + +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.CollectorPluginReport; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; + +public class MongoDbCollectorPlugin implements CollectorPlugin { + + public static final String MONGODB_HOST = "mongodb_host"; + public static final String MONGODB_PORT = "mongodb_port"; + public static final String MONGODB_COLLECTION = "mongodb_collection"; + public static final String MONGODB_DBNAME = "mongodb_dbname"; + + @Override + public Stream collect(ApiDescriptor api, CollectorPluginReport report) throws CollectorException { + + final String host = Optional + .ofNullable(api.getParams().get(MONGODB_HOST)) + .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_HOST))); + + final Integer port = Optional + .ofNullable(api.getParams().get(MONGODB_PORT)) + .map(Integer::parseInt) + .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_PORT))); + + final String dbName = Optional + .ofNullable(api.getParams().get(MONGODB_DBNAME)) + .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_DBNAME))); + + final String collection = Optional + .ofNullable(api.getParams().get(MONGODB_COLLECTION)) + .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_COLLECTION))); + + final MongoClient mongoClient = new MongoClient(host, port); + final MongoDatabase database = mongoClient.getDatabase(dbName); + final MongoCollection mdstore = database.getCollection(collection); + + long size = mdstore.count(); + + return StreamSupport + .stream( + Spliterators.spliterator(mdstore.find().iterator(), size, Spliterator.SIZED), false) + .map(doc -> doc.getString("body")); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbDumpCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbDumpCollectorPlugin.java new file mode 100644 index 000000000..d08732593 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbDumpCollectorPlugin.java @@ -0,0 +1,54 @@ + +package eu.dnetlib.dhp.collection.plugin.mongodb; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.util.Optional; +import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.CollectorPluginReport; +import 
eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.utils.DHPUtils; + +public class MongoDbDumpCollectorPlugin implements CollectorPlugin { + + public static final String PATH_PARAM = "path"; + public static final String BODY_JSONPATH = "$.body"; + + public FileSystem fileSystem; + + public MongoDbDumpCollectorPlugin(FileSystem fileSystem) { + this.fileSystem = fileSystem; + } + + @Override + public Stream collect(ApiDescriptor api, CollectorPluginReport report) throws CollectorException { + + final Path path = Optional + .ofNullable(api.getParams().get("path")) + .map(Path::new) + .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", PATH_PARAM))); + + try { + if (!fileSystem.exists(path)) { + throw new CollectorException("path does not exist: " + path.toString()); + } + + return new BufferedReader( + new InputStreamReader(new GZIPInputStream(fileSystem.open(path)), Charset.defaultCharset())) + .lines() + .map(s -> DHPUtils.getJPathString(BODY_JSONPATH, s)); + + } catch (IOException e) { + throw new CollectorException(e); + } + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index 84228abf4..8efdeb838 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -14,10 +14,10 @@ import com.google.common.collect.Iterators; import com.google.common.collect.Lists; import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.CollectorPluginReport; +import eu.dnetlib.dhp.collection.HttpClientParams; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.CollectorPluginReport; -import eu.dnetlib.dhp.collection.worker.HttpClientParams; public class OaiCollectorPlugin implements CollectorPlugin { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index 8d913b68f..edfcb7bb5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -16,10 +16,10 @@ import org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.CollectorPluginReport; -import eu.dnetlib.dhp.collection.worker.HttpConnector2; -import eu.dnetlib.dhp.collection.worker.XmlCleaner; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.CollectorPluginReport; +import eu.dnetlib.dhp.collection.HttpConnector2; +import eu.dnetlib.dhp.collection.XmlCleaner; public class OaiIterator implements Iterator { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java index f63fa37a1..d7b5de087 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java @@ -3,9 +3,9 @@ package eu.dnetlib.dhp.collection.plugin.oai; import java.util.Iterator; -import eu.dnetlib.dhp.collection.worker.CollectorPluginReport; -import eu.dnetlib.dhp.collection.worker.HttpClientParams; -import eu.dnetlib.dhp.collection.worker.HttpConnector2; +import eu.dnetlib.dhp.collection.CollectorPluginReport; +import eu.dnetlib.dhp.collection.HttpClientParams; +import eu.dnetlib.dhp.collection.HttpConnector2; public class OaiIteratorFactory { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorPluginFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorPluginFactory.java deleted file mode 100644 index 9668098f0..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/CollectorPluginFactory.java +++ /dev/null @@ -1,20 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker; - -import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; - -public class CollectorPluginFactory { - - public static CollectorPlugin getPluginByProtocol(final HttpClientParams clientParams, final String protocol) - throws UnknownCollectorPluginException { - if (protocol == null) - throw new UnknownCollectorPluginException("protocol cannot be null"); - switch (protocol.toLowerCase().trim()) { - case "oai": - return new OaiCollectorPlugin(clientParams); - default: - throw new UnknownCollectorPluginException("Unknown protocol"); - } - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index 5497b2c50..1bab59659 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -98,7 +98,7 @@ - eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication + eu.dnetlib.dhp.collection.CollectorWorkerApplication ${collection_java_xmx} --apidescriptor${apiDescription} --namenode${nameNode} @@ -118,7 +118,7 @@ - eu.dnetlib.dhp.collection.worker.CollectorWorkerReporter + eu.dnetlib.dhp.collection.CollectorWorkerReporter ${collection_java_xmx} --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} --namenode${nameNode} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java index 7f597f950..acb4caa22 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java @@ -13,8 +13,8 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser; -import eu.dnetlib.dhp.collection.worker.CollectorException; -import eu.dnetlib.dhp.collection.worker.HttpConnector2; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpConnector2; @Disabled public class EXCELParserTest { 
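With CollectorPluginFactory deleted, plugin selection moves into CollectorWorker.getCollectorPlugin(), keyed on the protocol and, for protocol "other", on the other_plugin_type parameter. The sketch below shows an ApiDescriptor that would route collection to the new MongoDbCollectorPlugin, deserialized the same way CollectorWorkerApplication parses its --apidescriptor argument. The host/port/dbname/collection values are invented for illustration, and this JSON is not a copy of the apiDescriptor.json fixture added by this patch.

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.collection.ApiDescriptor;

public class MongoDbApiDescriptorSketch {

    public static void main(String[] args) throws Exception {
        // Illustrative descriptor: protocol "other" plus other_plugin_type=mdstore_mongodb
        // is the combination getCollectorPlugin() maps to MongoDbCollectorPlugin.
        final String json = "{"
            + "\"protocol\": \"other\","
            + "\"params\": {"
            + "  \"other_plugin_type\": \"mdstore_mongodb\","
            + "  \"mongodb_host\": \"localhost\","
            + "  \"mongodb_port\": \"27017\","
            + "  \"mongodb_dbname\": \"mdstore\","
            + "  \"mongodb_collection\": \"records\""
            + "}}";

        final ApiDescriptor api = new ObjectMapper().readValue(json, ApiDescriptor.class);
        System.out.println(api.getProtocol() + " -> " + api.getParams().get("other_plugin_type"));
    }
}

Using "mdstore_mongodb_dump" as other_plugin_type instead selects MongoDbDumpCollectorPlugin, which reads a gzipped dump from the HDFS path given in the "path" parameter rather than connecting to a live MongoDB.
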
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java deleted file mode 100644 index 6f7bb2bc2..000000000 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java +++ /dev/null @@ -1,130 +0,0 @@ - -package eu.dnetlib.dhp.collection; - -import static org.junit.jupiter.api.Assertions.*; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SparkSession; -import org.junit.jupiter.api.*; -import org.junit.jupiter.api.io.TempDir; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.data.mdstore.manager.common.model.MDStoreCurrentVersion; -import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; -import eu.dnetlib.dhp.schema.common.ModelSupport; - -public class CollectionJobTest { - - private static SparkSession spark; - - @BeforeAll - public static void beforeAll() { - SparkConf conf = new SparkConf(); - conf.setAppName(CollectionJobTest.class.getSimpleName()); - conf.setMaster("local"); - spark = SparkSession.builder().config(conf).getOrCreate(); - } - - @AfterAll - public static void afterAll() { - spark.stop(); - } - - @Test - public void testJSONSerialization() throws Exception { - final String s = IOUtils.toString(getClass().getResourceAsStream("input.json")); - System.out.println("s = " + s); - final ObjectMapper mapper = new ObjectMapper(); - MDStoreVersion mi = mapper.readValue(s, MDStoreVersion.class); - - assertNotNull(mi); - - } - - @Test - public void tesCollection(@TempDir Path testDir) throws Exception { - final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); - Assertions.assertNotNull(new ObjectMapper().writeValueAsString(provenance)); - - GenerateNativeStoreSparkJob - .main( - new String[] { - "issm", "true", - "-w", "wid", - "-e", "XML", - "-d", "" + System.currentTimeMillis(), - "-p", new ObjectMapper().writeValueAsString(provenance), - "-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(), - "-o", testDir.toString() + "/store", - "-t", "true", - "-ru", "", - "-rp", "", - "-rh", "", - "-ro", "", - "-rr", "" - }); - - // TODO introduce useful assertions - - } - - @Test - public void testGenerationMetadataRecord() throws Exception { - - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - - final MetadataRecord record = GenerateNativeStoreSparkJob - .parseRecord( - xml, - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "XML", - new Provenance("foo", "bar", "ns_prefix"), - System.currentTimeMillis(), - null, - null); - - assertNotNull(record.getId()); - assertNotNull(record.getOriginalId()); - } - - @Test - public void TestEquals() throws IOException { - - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - final MetadataRecord record = GenerateNativeStoreSparkJob - .parseRecord( - xml, - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "XML", - new Provenance("foo", "bar", "ns_prefix"), 
- System.currentTimeMillis(), - null, - null); - final MetadataRecord record1 = GenerateNativeStoreSparkJob - .parseRecord( - xml, - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "XML", - new Provenance("foo", "bar", "ns_prefix"), - System.currentTimeMillis(), - null, - null); - - record.setBody("ciao"); - record1.setBody("mondo"); - - assertNotNull(record); - assertNotNull(record1); - assertEquals(record, record1); - } -} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionWorkflowTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionWorkflowTest.java new file mode 100644 index 000000000..cd6275d7f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionWorkflowTest.java @@ -0,0 +1,113 @@ + +package eu.dnetlib.dhp.collection; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.MethodOrderer; +import org.junit.jupiter.api.TestMethodOrder; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.junit.jupiter.MockitoExtension; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; + +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +@ExtendWith(MockitoExtension.class) +public class CollectionWorkflowTest { + + private static final Logger log = LoggerFactory.getLogger(CollectionWorkflowTest.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static Path workingDir; + + private static DistributedFileSystem fileSystem; + + // private static MiniDFSCluster hdfsCluster; + + private static ApiDescriptor api; + private static String mdStoreVersion; + + private static final String encoding = "XML"; + private static final String dateOfCollection = System.currentTimeMillis() + ""; + private static final String xpath = "//*[local-name()='header']/*[local-name()='identifier']"; + private static String provenance; + + private static final String msgMgrUrl = "http://localhost:%s/mock/mvc/dhp/message"; + + @BeforeAll + protected static void beforeAll() throws Exception { + provenance = IOUtils + .toString(CollectionWorkflowTest.class.getResourceAsStream("/eu/dnetlib/dhp/collection/provenance.json")); + + workingDir = Files.createTempDirectory(CollectionWorkflowTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + /* + * Configuration conf = new Configuration(); conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, + * workingDir.toString()); hdfsCluster = new MiniDFSCluster.Builder(conf).build(); fileSystem = + * hdfsCluster.getFileSystem(); api = OBJECT_MAPPER .readValue( + * IOUtils.toString(CollectionWorkflowTest.class.getResourceAsStream("apiDescriptor.json")), + * ApiDescriptor.class); mdStoreVersion = OBJECT_MAPPER + * .writeValueAsString(prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_1.json")); + */ + } + + @AfterAll + protected static void tearDown() { + /* + * hdfsCluster.shutdown(); FileUtil.fullyDelete(workingDir.toFile()); + */ + + } + + /** + + + eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication + ${collection_java_xmx} + --apidescriptor${apiDescription} + 
--namenode${nameNode} + --workflowId${workflowId} + --dnetMessageManagerURL${dnetMessageManagerURL} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --maxNumberOfRetry${maxNumberOfRetry} + --requestDelay${requestDelay} + --retryDelay${retryDelay} + --connectTimeOut${connectTimeOut} + --readTimeOut${readTimeOut} + + + + + + */ + // @Test + // @Order(1) + public void testCollectorWorkerApplication() throws Exception { + + final HttpClientParams httpClientParams = new HttpClientParams(); + + // String url = String.format(msgMgrUrl, wireMockServer.port()); + + // new CollectorWorkerApplication(fileSystem).run(mdStoreVersion, httpClientParams, api, url, "1234"); + + } + + public static MDStoreVersion prepareVersion(String filename) throws IOException { + MDStoreVersion mdstore = OBJECT_MAPPER + .readValue(IOUtils.toString(CollectionWorkflowTest.class.getResource(filename)), MDStoreVersion.class); + mdstore.setHdfsPath(String.format(mdstore.getHdfsPath(), workingDir.toString())); + return mdstore; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java similarity index 73% rename from dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java rename to dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java index ff3ff3b6e..723f030a6 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/aggregation/AggregationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java @@ -1,8 +1,9 @@ -package eu.dnetlib.dhp.aggregation; +package eu.dnetlib.dhp.collection; import static eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import java.io.File; import java.io.FileOutputStream; @@ -36,14 +37,14 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; -import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob; +import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.model.mdstore.Provenance; import eu.dnetlib.dhp.transformation.TransformSparkJobNode; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; @TestMethodOrder(MethodOrderer.OrderAnnotation.class) @ExtendWith(MockitoExtension.class) -public class AggregationJobTest extends AbstractVocabularyTest { +public class GenerateNativeStoreSparkJobTest extends AbstractVocabularyTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -58,18 +59,20 @@ public class AggregationJobTest extends AbstractVocabularyTest { private static final String xpath = "//*[local-name()='header']/*[local-name()='identifier']"; private static String provenance; - private static final Logger log = LoggerFactory.getLogger(AggregationJobTest.class); + private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJobTest.class); @BeforeAll public static void beforeAll() throws IOException { provenance = IOUtils - .toString(AggregationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/collection/provenance.json")); - workingDir = 
Files.createTempDirectory(AggregationJobTest.class.getSimpleName()); + .toString( + GenerateNativeStoreSparkJobTest.class + .getResourceAsStream("/eu/dnetlib/dhp/collection/provenance.json")); + workingDir = Files.createTempDirectory(GenerateNativeStoreSparkJobTest.class.getSimpleName()); log.info("using work dir {}", workingDir); SparkConf conf = new SparkConf(); - conf.setAppName(AggregationJobTest.class.getSimpleName()); + conf.setAppName(GenerateNativeStoreSparkJobTest.class.getSimpleName()); conf.setMaster("local[*]"); conf.set("spark.driver.host", "localhost"); @@ -81,7 +84,7 @@ public class AggregationJobTest extends AbstractVocabularyTest { encoder = Encoders.bean(MetadataRecord.class); spark = SparkSession .builder() - .appName(AggregationJobTest.class.getSimpleName()) + .appName(GenerateNativeStoreSparkJobTest.class.getSimpleName()) .config(conf) .getOrCreate(); } @@ -202,6 +205,67 @@ public class AggregationJobTest extends AbstractVocabularyTest { } + @Test + public void testJSONSerialization() throws Exception { + final String s = IOUtils.toString(getClass().getResourceAsStream("mdStoreVersion_1.json")); + System.out.println("s = " + s); + final ObjectMapper mapper = new ObjectMapper(); + MDStoreVersion mi = mapper.readValue(s, MDStoreVersion.class); + + assertNotNull(mi); + + } + + @Test + public void testGenerationMetadataRecord() throws Exception { + + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + + final MetadataRecord record = GenerateNativeStoreSparkJob + .parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); + + assertNotNull(record.getId()); + assertNotNull(record.getOriginalId()); + } + + @Test + public void testEquals() throws IOException { + + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + final MetadataRecord record = GenerateNativeStoreSparkJob + .parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); + final MetadataRecord record1 = GenerateNativeStoreSparkJob + .parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); + + record.setBody("ciao"); + record1.setBody("mondo"); + + assertNotNull(record); + assertNotNull(record1); + assertEquals(record, record1); + } + protected void verify(MDStoreVersion mdStoreVersion) throws IOException { Assertions.assertTrue(new File(mdStoreVersion.getHdfsPath()).exists()); @@ -226,7 +290,7 @@ public class AggregationJobTest extends AbstractVocabularyTest { Assertions.assertEquals(seqFileSize, uniqueIds, "the size must be equal"); } - private MDStoreVersion prepareVersion(String filename) throws IOException { + public MDStoreVersion prepareVersion(String filename) throws IOException { MDStoreVersion mdstore = OBJECT_MAPPER .readValue(IOUtils.toString(getClass().getResource(filename)), MDStoreVersion.class); mdstore.setHdfsPath(String.format(mdstore.getHdfsPath(), workingDir.toString())); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java 
b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java index 975ef944e..b5ea5f069 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/CollectorWorkerApplicationTests.java @@ -9,20 +9,10 @@ import org.junit.jupiter.api.Test; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.collection.ApiDescriptor; -import eu.dnetlib.dhp.collection.worker.CollectorPluginFactory; -import eu.dnetlib.dhp.collection.worker.HttpClientParams; @Disabled public class CollectorWorkerApplicationTests { - @Test - public void testFindPlugin() throws Exception { - final CollectorPluginFactory collectorPluginEnumerator = new CollectorPluginFactory(); - final HttpClientParams clientParams = new HttpClientParams(); - assertNotNull(collectorPluginEnumerator.getPluginByProtocol(clientParams, "oai")); - assertNotNull(collectorPluginEnumerator.getPluginByProtocol(clientParams, "OAI")); - } - @Test public void testCollectionOAI() throws Exception { final ApiDescriptor api = new ApiDescriptor(); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java index d665e5b5f..fd90a1b84 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java @@ -8,7 +8,7 @@ import java.io.IOException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; -import eu.dnetlib.dhp.collection.worker.CollectorPluginReport; +import eu.dnetlib.dhp.collection.CollectorPluginReport; public class CollectorPluginReportTest { diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 997727e33..356fb252d 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -27,7 +27,7 @@ import org.mockito.junit.jupiter.MockitoExtension; import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest; import eu.dnetlib.dhp.aggregation.common.AggregationCounter; -import eu.dnetlib.dhp.collection.CollectionJobTest; +import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJobTest; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; @@ -40,7 +40,7 @@ public class TransformationJobTest extends AbstractVocabularyTest { @BeforeAll public static void beforeAll() throws IOException, ISLookUpException { SparkConf conf = new SparkConf(); - conf.setAppName(CollectionJobTest.class.getSimpleName()); + conf.setAppName(GenerateNativeStoreSparkJobTest.class.getSimpleName()); conf.setMaster("local"); spark = SparkSession.builder().config(conf).getOrCreate(); } diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/apiDescriptor.json 
b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/apiDescriptor.json new file mode 100644 index 000000000..99957cac9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/apiDescriptor.json @@ -0,0 +1,10 @@ +{ + "id":"api_________::opendoar____::2::0", + "baseUrl":"https://www.alexandria.unisg.ch/cgi/oai2", + "protocol":"oai", + "params": { + "set":"driver", + "metadata_identifier_path":"//*[local-name()\u003d\u0027header\u0027]/*[local-name()\u003d\u0027identifier\u0027]", + "format":"oai_dc" + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml index 1f5cf7b81..0352092b2 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/fields.xml @@ -79,6 +79,8 @@ + + diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/record.xml b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/record.xml index b617dbea2..a0ca0aa6f 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/record.xml +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/record.xml @@ -39,6 +39,8 @@ Saykally, Jessica N. Keeley, Kristen L. Haris Hatic + Baglioni, Miriam + De Bonis, Michele 2017-06-01 Withania somnifera has been used in traditional medicine for a variety of neural disorders. Recently, chronic neurodegenerative conditions have been @@ -115,7 +117,7 @@ Cell Transplantation - + Cell Transplantation From 5a9017cf18860ddcd3588e62804826b124463333 Mon Sep 17 00:00:00 2001 From: Andreas Czerniak Date: Fri, 12 Feb 2021 14:32:36 +0100 Subject: [PATCH 46/86] clone, min. 
changes, test, run --- .../main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java | 2 +- .../src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl | 2 +- .../src/test/resources/eu/dnetlib/dhp/transform/tr.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java index 7b0fdd484..1343a99b9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java @@ -16,7 +16,7 @@ public class Cleaner implements ExtensionFunction, Serializable { @Override public QName getName() { - return new QName("http://eu/dnetlib/trasform/extension", "clean"); + return new QName("http://eu/dnetlib/transform/extension", "clean"); } @Override diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl index becd3a05e..aed3de656 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl @@ -1,7 +1,7 @@ diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml index a9eae8576..77fccb4d3 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml @@ -16,7 +16,7 @@ From f216277219a164ddb12b9c99610a368eac0212c1 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 12 Feb 2021 16:34:52 +0100 Subject: [PATCH 47/86] Implemented cleaning date --- .../dhp/transformation/xslt/Cleaner.java | 2 +- .../dhp/transformation/xslt/DateCleaner.java | 100 ++++++++++++++++++ .../xslt/XSLTTransformationFunction.java | 1 + .../transformation/TransformationJobTest.java | 30 ++++-- .../eu/dnetlib/dhp/transform/input_zenodo.xml | 2 +- .../eu/dnetlib/dhp/transform/zenodo_tr.xslt | 7 +- 6 files changed, 128 insertions(+), 14 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java index 8b7024cbe..fbf47c2ff 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java @@ -35,7 +35,7 @@ public class Cleaner implements ExtensionFunction, Serializable { @Override public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { XdmValue r = xdmValues[0]; - if (r.size() == 0){ + if (r.size() == 0) { return new XdmAtomicValue(""); } final String currentValue = xdmValues[0].itemAt(0).getStringValue(); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java new file mode 100644 index 000000000..98bea8de4 --- /dev/null +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java @@ -0,0 +1,100 @@ +package eu.dnetlib.dhp.transformation.xslt; + +import net.sf.saxon.s9api.*; +import scala.Serializable; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +public class DateCleaner implements ExtensionFunction, Serializable { + + private final static List dateRegex = Arrays.asList( + //Y-M-D + Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE), + //M-D-Y + Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE), + //D-M-Y + Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE), + //Y + Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE) + ); + + private final static Pattern incompleteDateRegex = Pattern.compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE); + + private final static List dformats = Arrays.asList( + DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH), + DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN) + ); + + public String clean(final String inputDate) { + + Optional cleanedDate = dateRegex.stream().map( + p -> { + final Matcher matcher = p.matcher(inputDate); + if (matcher.find()) + return matcher.group(0); + else + return null; + } + ).filter(Objects::nonNull) + .map(m -> { + Optional cleanDate = dformats.stream() + .map(f -> { + try { + LocalDate parsedDate = LocalDate.parse(m, f); + if (parsedDate != null) + return parsedDate.toString(); + else + return null; + } catch (Throwable e) { + return null; + } + } + + ).filter(Objects::nonNull).findAny(); + + return cleanDate.orElse(null); + }).filter(Objects::nonNull).findAny(); + + if (cleanedDate.isPresent()) + return cleanedDate.get(); + + final Matcher matcher = incompleteDateRegex.matcher(inputDate); + if (matcher.find()){ + final Integer year = Integer.parseInt(matcher.group(1)); + final Integer month = Integer.parseInt(matcher.group(4) == null ? 
"01":matcher.group(4)); + return String.format("%d-%02d-01",year, month); + } + return null; + } + + @Override + public QName getName() { + return new QName("http://eu/dnetlib/trasform/dates", "dateISO"); + } + + @Override + public SequenceType getResultType() { + return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE); + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE) + }; + } + + @Override + public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { + XdmValue r = xdmValues[0]; + if (r.size() == 0) { + return new XdmAtomicValue(""); + } + final String currentValue = xdmValues[0].itemAt(0).getStringValue(); + return new XdmAtomicValue(clean(currentValue)); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java index d8707cd76..d37832bb4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java @@ -41,6 +41,7 @@ public class XSLTTransformationFunction implements MapFunctionADCP - 2019-05-29 + 2019 diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt index e67ed9dda..23e57579b 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt @@ -4,8 +4,9 @@ xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:vocabulary="http://eu/dnetlib/trasform/extension" + xmlns:dateCleaner="http://eu/dnetlib/trasform/dates" xmlns:dr="http://www.driver-repository.eu/namespace/dr" - exclude-result-prefixes="xsl vocabulary"> + exclude-result-prefixes="xsl vocabulary dateCleaner"> @@ -53,7 +54,7 @@ + select="dateCleaner:dateISO(normalize-space(//*[local-name()='date'][@dateType='Available']))"/> @@ -112,7 +113,7 @@ + select="dateCleaner:dateISO(normalize-space(//*[local-name()='publicationYear']))"/> From 6a37c7f175eb7efc7e2d16fb6f10830851ae1b57 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 12 Feb 2021 16:38:47 +0100 Subject: [PATCH 48/86] merge fixed --- .../eu/dnetlib/dhp/transformation/TransformationJobTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index df71a513b..b76f9bce6 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -28,7 +28,7 @@ import org.mockito.junit.jupiter.MockitoExtension; import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest; import eu.dnetlib.dhp.aggregation.common.AggregationCounter; -import eu.dnetlib.dhp.collection.CollectionJobTest; + import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import 
eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; @@ -41,7 +41,7 @@ public class TransformationJobTest extends AbstractVocabularyTest { @BeforeAll public static void beforeAll() throws IOException, ISLookUpException { SparkConf conf = new SparkConf(); - conf.setAppName(CollectionJobTest.class.getSimpleName()); + conf.setAppName(TransformationJobTest.class.getSimpleName()); conf.setMaster("local"); spark = SparkSession.builder().config(conf).getOrCreate(); } From 7edcc87ed4689b41593a0ac5382fc62f59ac7fe7 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 12 Feb 2021 17:27:08 +0100 Subject: [PATCH 49/86] changed xslt behaviour on failure --- .../transformation/TransformSparkJobNode.java | 3 +- .../dhp/transformation/xslt/DateCleaner.java | 170 ++++++++++-------- .../xslt/XSLTTransformationFunction.java | 2 +- .../transformation/TransformationJobTest.java | 19 +- 4 files changed, 103 insertions(+), 91 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index e628e7645..f9a18987d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -106,7 +106,8 @@ public class TransformSparkJobNode { log.info("Transformation Error item " + ct.getErrorItems().count()); writeHdfsFile( - spark.sparkContext().hadoopConfiguration(), "" + mdstore.count(), outputBasePath + MDSTORE_SIZE_PATH); + spark.sparkContext().hadoopConfiguration(), + "" + spark.read().load(outputBasePath + MDSTORE_DATA_PATH).count(), outputBasePath + MDSTORE_SIZE_PATH); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java index 98bea8de4..4e1a29b52 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java @@ -1,100 +1,118 @@ + package eu.dnetlib.dhp.transformation.xslt; -import net.sf.saxon.s9api.*; -import scala.Serializable; import java.time.LocalDate; import java.time.format.DateTimeFormatter; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; +import net.sf.saxon.s9api.*; +import scala.Serializable; public class DateCleaner implements ExtensionFunction, Serializable { - private final static List dateRegex = Arrays.asList( - //Y-M-D - Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE), - //M-D-Y - Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE), - //D-M-Y - Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", 
Pattern.MULTILINE), - //Y - Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE) - ); + private final static List dateRegex = Arrays + .asList( + // Y-M-D + Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE), + // M-D-Y + Pattern + .compile( + "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", + Pattern.MULTILINE), + // D-M-Y + Pattern + .compile( + "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", + Pattern.MULTILINE), + // Y + Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)); - private final static Pattern incompleteDateRegex = Pattern.compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE); + private final static Pattern incompleteDateRegex = Pattern + .compile("^((18|19|20)\\d\\d){1}([- \\\\ \\/](0?[1-9]|1[012]))?", Pattern.MULTILINE); - private final static List dformats = Arrays.asList( - DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH), - DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN) - ); + private final static List dformats = Arrays + .asList( + DateTimeFormatter + .ofPattern( + "[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", + Locale.ENGLISH), + DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)); - public String clean(final String inputDate) { + public String clean(final String inputDate) { - Optional cleanedDate = dateRegex.stream().map( - p -> { - final Matcher matcher = p.matcher(inputDate); - if (matcher.find()) - return matcher.group(0); - else - return null; - } - ).filter(Objects::nonNull) - .map(m -> { - Optional cleanDate = dformats.stream() - .map(f -> { - try { - LocalDate parsedDate = LocalDate.parse(m, f); - if (parsedDate != null) - return parsedDate.toString(); - else - return null; - } catch (Throwable e) { - return null; - } - } + Optional cleanedDate = dateRegex + .stream() + .map( + p -> { + final Matcher matcher = p.matcher(inputDate); + if (matcher.find()) + return matcher.group(0); + else + return null; + }) + .filter(Objects::nonNull) + .map(m -> { + Optional cleanDate = dformats + .stream() + .map(f -> { + try { + LocalDate parsedDate = LocalDate.parse(m, f); + if (parsedDate != null) + return parsedDate.toString(); + else + return null; + } catch (Throwable e) { + return null; + } + } - ).filter(Objects::nonNull).findAny(); + ) + .filter(Objects::nonNull) + .findAny(); - return cleanDate.orElse(null); - }).filter(Objects::nonNull).findAny(); + return cleanDate.orElse(null); + }) + .filter(Objects::nonNull) + .findAny(); - if (cleanedDate.isPresent()) - return cleanedDate.get(); + if (cleanedDate.isPresent()) + return cleanedDate.get(); - final Matcher matcher = incompleteDateRegex.matcher(inputDate); - if (matcher.find()){ - final Integer year = Integer.parseInt(matcher.group(1)); - final Integer month = 
Integer.parseInt(matcher.group(4) == null ? "01":matcher.group(4)); - return String.format("%d-%02d-01",year, month); - } - return null; - } + final Matcher matcher = incompleteDateRegex.matcher(inputDate); + if (matcher.find()) { + final Integer year = Integer.parseInt(matcher.group(1)); + final Integer month = Integer.parseInt(matcher.group(4) == null ? "01" : matcher.group(4)); + return String.format("%d-%02d-01", year, month); + } + return null; + } - @Override - public QName getName() { - return new QName("http://eu/dnetlib/trasform/dates", "dateISO"); - } + @Override + public QName getName() { + return new QName("http://eu/dnetlib/trasform/dates", "dateISO"); + } - @Override - public SequenceType getResultType() { - return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE); - } + @Override + public SequenceType getResultType() { + return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE); + } - @Override - public SequenceType[] getArgumentTypes() { - return new SequenceType[] { - SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE) - }; - } + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE) + }; + } - @Override - public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { - XdmValue r = xdmValues[0]; - if (r.size() == 0) { - return new XdmAtomicValue(""); - } - final String currentValue = xdmValues[0].itemAt(0).getStringValue(); - return new XdmAtomicValue(clean(currentValue)); - } + @Override + public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { + XdmValue r = xdmValues[0]; + if (r.size() == 0) { + return new XdmAtomicValue(""); + } + final String currentValue = xdmValues[0].itemAt(0).getStringValue(); + return new XdmAtomicValue(clean(currentValue)); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java index d37832bb4..7d47cc84d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java @@ -63,7 +63,7 @@ public class XSLTTransformationFunction implements MapFunction Date: Mon, 15 Feb 2021 15:08:59 +0100 Subject: [PATCH 50/86] WIP: collectorWorker error reporting, added report messages --- .../java/eu/dnetlib/dhp/message/Message.java | 22 ++++--- .../eu/dnetlib/dhp/message/MessageSender.java | 11 ++-- .../eu/dnetlib/dhp/message/MessageType.java | 20 ++++++ .../dhp/collection/CollectorPluginReport.java | 38 +++--------- .../dhp/collection/CollectorWorker.java | 10 +-- .../CollectorWorkerApplication.java | 15 ++--- .../collection/CollectorWorkerReporter.java | 62 ------------------- .../collector_reporter_input_parameter.json | 14 ----- .../dhp/collection/oozie_app/workflow.xml | 12 ---- .../utils/CollectorPluginReportTest.java | 30 --------- 10 files changed, 57 insertions(+), 177 deletions(-) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerReporter.java delete mode 100644 
dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_reporter_input_parameter.json delete mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java index ed2a3c9b3..0cbb6c859 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java @@ -3,31 +3,36 @@ package eu.dnetlib.dhp.message; import java.io.Serializable; import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.Map; public class Message implements Serializable { + private static final long serialVersionUID = 401753881204524893L; + public static String CURRENT_PARAM = "current"; public static String TOTAL_PARAM = "total"; - /** - * - */ - private static final long serialVersionUID = 401753881204524893L; + private MessageType messageType; private String workflowId; private Map body; - public Message() { - body = new HashMap<>(); + public Message(final MessageType messageType, final String workflowId) { + this(messageType, workflowId, new LinkedHashMap<>()); } - public Message(final String workflowId, final Map body) { + public Message(final MessageType messageType, final String workflowId, final Map body) { + this.messageType = messageType; this.workflowId = workflowId; this.body = body; } + public MessageType getMessageType() { + return messageType; + } + public String getWorkflowId() { return workflowId; } @@ -46,6 +51,7 @@ public class Message implements Serializable { @Override public String toString() { - return String.format("Message [workflowId=%s, body=%s]", workflowId, body); + return String.format("Message [type=%s, workflowId=%s, body=%s]", messageType, workflowId, body); } + } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java index 16bb0c97e..0c6eacf99 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.message; +import java.util.Map; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -45,13 +46,15 @@ public class MessageSender { } public void sendMessage(final Long current, final Long total) { - sendMessage(createMessage(current, total)); + sendMessage(createOngoingMessage(current, total)); } - private Message createMessage(final Long current, final Long total) { + public void sendReport(final Map report) { + sendMessage(new Message(MessageType.REPORT, workflowId, report)); + } - final Message m = new Message(); - m.setWorkflowId(workflowId); + private Message createOngoingMessage(final Long current, final Long total) { + final Message m = new Message(MessageType.ONGOING, workflowId); m.getBody().put(Message.CURRENT_PARAM, current.toString()); if (total != null) { m.getBody().put(Message.TOTAL_PARAM, total.toString()); diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java new file mode 100644 index 000000000..30f152c96 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java @@ -0,0 +1,20 @@ + +package eu.dnetlib.dhp.message; + +import java.util.Optional; + +import org.apache.commons.lang3.StringUtils; + 
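+/**
+ * Types of messages sent to the D-Net message manager: ONGOING carries the
+ * periodic progress counters (current/total), REPORT carries the final
+ * key/value report produced when the workflow node terminates.
+ */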
+public enum MessageType { + + ONGOING, REPORT; + + public MessageType from(String value) { + return Optional + .ofNullable(value) + .map(StringUtils::upperCase) + .map(MessageType::valueOf) + .orElseThrow(() -> new IllegalArgumentException("unknown message type: " + value)); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java index a7204523a..a10572d06 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java @@ -1,57 +1,39 @@ package eu.dnetlib.dhp.collection; -import static eu.dnetlib.dhp.utils.DHPUtils.*; - import java.io.Closeable; import java.io.IOException; import java.util.LinkedHashMap; import java.util.Objects; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.annotation.JsonIgnore; +import eu.dnetlib.dhp.message.MessageSender; public class CollectorPluginReport extends LinkedHashMap implements Closeable { private static final Logger log = LoggerFactory.getLogger(CollectorPluginReport.class); - @JsonIgnore - private Path path; - - @JsonIgnore - private FSDataOutputStream fos; - - public static String SUCCESS = "success"; + private MessageSender messageSender; public CollectorPluginReport() { } - public CollectorPluginReport(FileSystem fs, Path path) throws IOException { - this.path = path; - this.fos = fs.create(path); + public CollectorPluginReport(MessageSender messageSender) throws IOException { + this.messageSender = messageSender; } - public Boolean isSuccess() { - return containsKey(SUCCESS) && Boolean.valueOf(get(SUCCESS)); - } - - public void setSuccess(Boolean success) { - put(SUCCESS, String.valueOf(success)); + public void ongoing(Long current, Long total) { + messageSender.sendMessage(current, total); } @Override public void close() throws IOException { - final String data = MAPPER.writeValueAsString(this); - if (Objects.nonNull(fos)) { - log.info("writing report {} to {}", data, path.toString()); - IOUtils.write(data, fos); - populateOOZIEEnv(this); + if (Objects.nonNull(messageSender)) { + log.info("closing report: "); + this.forEach((k, v) -> log.info("{} - {}", k, v)); + messageSender.sendReport(this); } } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java index ace725bfd..04e0f70c4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java @@ -38,20 +38,16 @@ public class CollectorWorker { private final CollectorPluginReport report; - private final MessageSender messageSender; - public CollectorWorker( final ApiDescriptor api, final FileSystem fileSystem, final MDStoreVersion mdStoreVersion, final HttpClientParams clientParams, - final MessageSender messageSender, final CollectorPluginReport report) { this.api = api; this.fileSystem = fileSystem; this.mdStoreVersion = mdStoreVersion; this.clientParams = clientParams; - this.messageSender = messageSender; this.report 
= report; } @@ -78,23 +74,21 @@ public class CollectorWorker { content -> { key.set(counter.getAndIncrement()); if (counter.get() % 500 == 0) - messageSender.sendMessage(counter.longValue(), null); + report.ongoing(counter.longValue(), null); value.set(content); try { writer.append(key, value); } catch (Throwable e) { report.put(e.getClass().getName(), e.getMessage()); log.warn("setting report to failed"); - report.setSuccess(false); throw new RuntimeException(e); } }); } catch (Throwable e) { report.put(e.getClass().getName(), e.getMessage()); log.warn("setting report to failed"); - report.setSuccess(false); } finally { - messageSender.sendMessage(counter.longValue(), counter.longValue()); + report.ongoing(counter.longValue(), counter.longValue()); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java index 0eea0837c..15f3f20b5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java @@ -77,22 +77,15 @@ public class CollectorWorkerApplication { } protected void run(String mdStoreVersion, HttpClientParams clientParams, ApiDescriptor api, - String dnetMessageManagerURL, String workflowId) throws IOException { + String dnetMessageManagerURL, String workflowId) + throws IOException, CollectorException, UnknownCollectorPluginException { final MessageSender ms = new MessageSender(dnetMessageManagerURL, workflowId); final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class); - final String reportPath = currentVersion.getHdfsPath() + REPORT_FILE_NAME; - log.info("report path is {}", reportPath); - - try (CollectorPluginReport report = new CollectorPluginReport(fileSystem, new Path(reportPath))) { - final CollectorWorker worker = new CollectorWorker(api, fileSystem, currentVersion, clientParams, ms, - report); - worker.collect(); - report.setSuccess(true); - } catch (Throwable e) { - log.info("got exception {}, ignoring", e.getMessage()); + try (CollectorPluginReport report = new CollectorPluginReport(ms)) { + new CollectorWorker(api, fileSystem, currentVersion, clientParams, report).collect(); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerReporter.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerReporter.java deleted file mode 100644 index d8cf3ec02..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerReporter.java +++ /dev/null @@ -1,62 +0,0 @@ - -package eu.dnetlib.dhp.collection; - -import static eu.dnetlib.dhp.common.Constants.REPORT_FILE_NAME; -import static eu.dnetlib.dhp.utils.DHPUtils.*; - -import java.io.IOException; -import java.util.Objects; - -import org.apache.commons.cli.ParseException; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; - -/** - * CollectorWorkerReporter - */ -public class CollectorWorkerReporter { - - private static final Logger log = LoggerFactory.getLogger(CollectorWorkerReporter.class); - - /** - * @param args - */ - public static 
void main(final String[] args) throws IOException, ParseException, CollectorException { - - final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser( - IOUtils - .toString( - CollectorWorker.class - .getResourceAsStream( - "/eu/dnetlib/dhp/collection/collector_reporter_input_parameter.json"))); - argumentParser.parseArgument(args); - - final String nameNode = argumentParser.get("namenode"); - log.info("nameNode is {}", nameNode); - - final String mdStoreVersion = argumentParser.get("mdStoreVersion"); - log.info("mdStoreVersion is {}", mdStoreVersion); - - final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class); - - final String reportPath = currentVersion.getHdfsPath() + REPORT_FILE_NAME; - log.info("report path is {}", reportPath); - - final Configuration conf = getHadoopConfiguration(nameNode); - CollectorPluginReport report = readHdfsFileAs(conf, reportPath, CollectorPluginReport.class); - if (Objects.isNull(report)) { - throw new CollectorException("collection report is NULL"); - } - log.info("report success: {}, size: {}", report.isSuccess(), report.size()); - report.forEach((k, v) -> log.info("{} - {}", k, v)); - if (!report.isSuccess()) { - throw new CollectorException("collection report indicates a failure"); - } - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_reporter_input_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_reporter_input_parameter.json deleted file mode 100644 index ef65cc389..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_reporter_input_parameter.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "paramName": "n", - "paramLongName": "namenode", - "paramDescription": "the Name Node URI", - "paramRequired": true - }, - { - "paramName": "mv", - "paramLongName": "mdStoreVersion", - "paramDescription": "the MDStore Version bean", - "paramRequired": true - } -] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index 1bab59659..0678eed11 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -110,18 +110,6 @@ --retryDelay${retryDelay} --connectTimeOut${connectTimeOut} --readTimeOut${readTimeOut} - - - - - - - - - eu.dnetlib.dhp.collection.CollectorWorkerReporter - ${collection_java_xmx} - --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} - --namenode${nameNode} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java deleted file mode 100644 index fd90a1b84..000000000 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/utils/CollectorPluginReportTest.java +++ /dev/null @@ -1,30 +0,0 @@ - -package eu.dnetlib.dhp.collector.worker.utils; - -import static eu.dnetlib.dhp.utils.DHPUtils.*; - -import java.io.IOException; - -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -import eu.dnetlib.dhp.collection.CollectorPluginReport; - -public class CollectorPluginReportTest { - - @Test - 
public void testSerialize() throws IOException { - CollectorPluginReport r1 = new CollectorPluginReport(); - r1.put("a", "b"); - r1.setSuccess(true); - - String s = MAPPER.writeValueAsString(r1); - - Assertions.assertNotNull(s); - - CollectorPluginReport r2 = MAPPER.readValue(s, CollectorPluginReport.class); - - Assertions.assertTrue(r2.isSuccess(), "should be true"); - } - -} From 58288a95b8f3eec2a09c677a33c7a9df74c4f250 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 15 Feb 2021 15:28:53 +0100 Subject: [PATCH 51/86] WIP: collectorWorker error reporting, added report messages --- .../src/main/java/eu/dnetlib/dhp/message/Message.java | 6 ++++++ .../src/main/java/eu/dnetlib/dhp/message/MessageType.java | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java index 0cbb6c859..ecccf8a43 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java @@ -19,6 +19,8 @@ public class Message implements Serializable { private Map body; + public Message() {} + public Message(final MessageType messageType, final String workflowId) { this(messageType, workflowId, new LinkedHashMap<>()); } @@ -33,6 +35,10 @@ public class Message implements Serializable { return messageType; } + public void setMessageType(MessageType messageType) { + this.messageType = messageType; + } + public String getWorkflowId() { return workflowId; } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java index 30f152c96..75ffb8ef5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java @@ -1,11 +1,12 @@ package eu.dnetlib.dhp.message; +import java.io.Serializable; import java.util.Optional; import org.apache.commons.lang3.StringUtils; -public enum MessageType { +public enum MessageType implements Serializable { ONGOING, REPORT; From cf27905a71070c5ea406744d498844714de495f7 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 16 Feb 2021 16:53:14 +0100 Subject: [PATCH 52/86] WIP: collectorWorker error reporting, added report messages --- .../java/eu/dnetlib/dhp/message/Message.java | 3 ++- .../dhp/collection/CollectorPluginReport.java | 10 +++++++- .../dhp/collection/CollectorWorker.java | 19 ++++++++++----- .../collection/plugin/oai/OaiIterator.java | 24 +++++++++---------- .../plugin/oai/OaiIteratorFactory.java | 5 ++-- 5 files changed, 38 insertions(+), 23 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java index ecccf8a43..f1107b4b8 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java @@ -19,7 +19,8 @@ public class Message implements Serializable { private Map body; - public Message() {} + public Message() { + } public Message(final MessageType messageType, final String workflowId) { this(messageType, workflowId, new LinkedHashMap<>()); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java index a10572d06..d8f167d49 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java @@ -3,12 +3,17 @@ package eu.dnetlib.dhp.collection; import java.io.Closeable; import java.io.IOException; +import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.Map; import java.util.Objects; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.base.Joiner; +import com.google.gson.Gson; + import eu.dnetlib.dhp.message.MessageSender; public class CollectorPluginReport extends LinkedHashMap implements Closeable { @@ -33,7 +38,10 @@ public class CollectorPluginReport extends LinkedHashMap impleme if (Objects.nonNull(messageSender)) { log.info("closing report: "); this.forEach((k, v) -> log.info("{} - {}", k, v)); - messageSender.sendReport(this); + + Map m = new HashMap<>(); + m.put(getClass().getSimpleName().toLowerCase(), new Gson().toJson(values())); + messageSender.sendReport(m); } } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java index 04e0f70c4..154b50414 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java @@ -5,6 +5,8 @@ import static eu.dnetlib.dhp.common.Constants.SEQUENCE_FILE_NAME; import java.io.IOException; import java.util.Optional; +import java.util.Timer; +import java.util.TimerTask; import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.lang3.StringUtils; @@ -22,11 +24,11 @@ import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; -import eu.dnetlib.dhp.message.MessageSender; public class CollectorWorker { private static final Logger log = LoggerFactory.getLogger(CollectorWorker.class); + public static final int ONGOING_REPORT_FREQUENCY_MS = 5000; private final ApiDescriptor api; @@ -59,6 +61,14 @@ public class CollectorWorker { final CollectorPlugin plugin = getCollectorPlugin(); final AtomicInteger counter = new AtomicInteger(0); + final Timer timer = new Timer(); + timer.schedule(new TimerTask() { + @Override + public void run() { + report.ongoing(counter.longValue(), null); + } + }, 5000, ONGOING_REPORT_FREQUENCY_MS); + try (SequenceFile.Writer writer = SequenceFile .createWriter( fileSystem.getConf(), @@ -73,21 +83,18 @@ public class CollectorWorker { .forEach( content -> { key.set(counter.getAndIncrement()); - if (counter.get() % 500 == 0) - report.ongoing(counter.longValue(), null); value.set(content); try { writer.append(key, value); } catch (Throwable e) { - report.put(e.getClass().getName(), e.getMessage()); - log.warn("setting report to failed"); throw new RuntimeException(e); } }); } catch (Throwable e) { report.put(e.getClass().getName(), e.getMessage()); - log.warn("setting report to failed"); + throw new CollectorException(e); } finally { + timer.cancel(); report.ongoing(counter.longValue(), counter.longValue()); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index edfcb7bb5..0a0a4c734 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -38,7 +38,7 @@ public class OaiIterator implements Iterator { private String token; private boolean started; private final HttpConnector2 httpConnector; - private CollectorPluginReport errorLogList; + private CollectorPluginReport report; public OaiIterator( final String baseUrl, @@ -47,7 +47,7 @@ public class OaiIterator implements Iterator { final String fromDate, final String untilDate, final HttpConnector2 httpConnector, - final CollectorPluginReport errorLogList) { + final CollectorPluginReport report) { this.baseUrl = baseUrl; this.mdFormat = mdFormat; this.set = set; @@ -55,7 +55,7 @@ public class OaiIterator implements Iterator { this.untilDate = untilDate; this.started = false; this.httpConnector = httpConnector; - this.errorLogList = errorLogList; + this.report = report; } private void verifyStarted() { @@ -113,7 +113,7 @@ public class OaiIterator implements Iterator { return downloadPage(url); } catch (final UnsupportedEncodingException e) { - errorLogList.put(e.getClass().getName(), e.getMessage()); + report.put(e.getClass().getName(), e.getMessage()); throw new CollectorException(e); } } @@ -139,27 +139,27 @@ public class OaiIterator implements Iterator { + "?verb=ListRecords&resumptionToken=" + URLEncoder.encode(resumptionToken, "UTF-8")); } catch (final UnsupportedEncodingException e) { - errorLogList.put(e.getClass().getName(), e.getMessage()); + report.put(e.getClass().getName(), e.getMessage()); throw new CollectorException(e); } } private String downloadPage(final String url) throws CollectorException { - final String xml = httpConnector.getInputSource(url, errorLogList); + final String xml = httpConnector.getInputSource(url, report); Document doc; try { doc = reader.read(new StringReader(xml)); } catch (final DocumentException e) { log.warn("Error parsing xml, I try to clean it. 
{}", e.getMessage()); - errorLogList.put(e.getClass().getName(), e.getMessage()); + report.put(e.getClass().getName(), e.getMessage()); final String cleaned = XmlCleaner.cleanAllEntities(xml); try { doc = reader.read(new StringReader(cleaned)); } catch (final DocumentException e1) { final String resumptionToken = extractResumptionToken(xml); if (resumptionToken == null) { - errorLogList.put(e1.getClass().getName(), e1.getMessage()); + report.put(e1.getClass().getName(), e1.getMessage()); throw new CollectorException("Error parsing cleaned document:\n" + cleaned, e1); } return resumptionToken; @@ -172,11 +172,11 @@ public class OaiIterator implements Iterator { if ("noRecordsMatch".equalsIgnoreCase(code)) { final String msg = "noRecordsMatch for oai call : " + url; log.warn(msg); - errorLogList.put(REPORT_PREFIX + code, msg); + report.put(REPORT_PREFIX + code, msg); return null; } else { final String msg = code + " - " + errorNode.getText(); - errorLogList.put(REPORT_PREFIX + "error", msg); + report.put(REPORT_PREFIX + "error", msg); throw new CollectorException(msg); } } @@ -188,7 +188,7 @@ public class OaiIterator implements Iterator { return doc.valueOf("//*[local-name()='resumptionToken']"); } - public CollectorPluginReport getErrorLogList() { - return errorLogList; + public CollectorPluginReport getReport() { + return report; } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java index d7b5de087..a72a62f13 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java @@ -18,9 +18,8 @@ public class OaiIteratorFactory { final String fromDate, final String untilDate, final HttpClientParams clientParams, - final CollectorPluginReport errorLogList) { - return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(clientParams), - errorLogList); + final CollectorPluginReport report) { + return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(clientParams), report); } private HttpConnector2 getHttpConnector(HttpClientParams clientParams) { From b592d78bb4dafdf2a2f3eac99ef2f511bf41863c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 17 Feb 2021 10:28:01 +0100 Subject: [PATCH 53/86] WIP: collectorWorker error reporting, generalised reported implementation --- .../common/AggregatorReport.java} | 11 ++-- .../aggregation/common/ReporterCallback.java | 10 ++++ .../dhp/aggregation/common/ReportingJob.java | 41 +++++++++++++ .../dhp/collection/CollectorWorker.java | 59 +++++++++++-------- .../CollectorWorkerApplication.java | 7 +-- .../dhp/collection/HttpConnector2.java | 14 +++-- .../collection/plugin/CollectorPlugin.java | 13 +++- .../mongodb/MongoDbCollectorPlugin.java | 4 +- .../mongodb/MongoDbDumpCollectorPlugin.java | 4 +- .../plugin/oai/OaiCollectorPlugin.java | 4 +- .../collection/plugin/oai/OaiIterator.java | 8 +-- .../plugin/oai/OaiIteratorFactory.java | 4 +- 12 files changed, 123 insertions(+), 56 deletions(-) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/{collection/CollectorPluginReport.java => aggregation/common/AggregatorReport.java} (69%) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReporterCallback.java create mode 
100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReportingJob.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java similarity index 69% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java index d8f167d49..269f8f6e9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorPluginReport.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection; +package eu.dnetlib.dhp.aggregation.common; import java.io.Closeable; import java.io.IOException; @@ -11,21 +11,20 @@ import java.util.Objects; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.base.Joiner; import com.google.gson.Gson; import eu.dnetlib.dhp.message.MessageSender; -public class CollectorPluginReport extends LinkedHashMap implements Closeable { +public class AggregatorReport extends LinkedHashMap implements Closeable { - private static final Logger log = LoggerFactory.getLogger(CollectorPluginReport.class); + private static final Logger log = LoggerFactory.getLogger(AggregatorReport.class); private MessageSender messageSender; - public CollectorPluginReport() { + public AggregatorReport() { } - public CollectorPluginReport(MessageSender messageSender) throws IOException { + public AggregatorReport(MessageSender messageSender) throws IOException { this.messageSender = messageSender; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReporterCallback.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReporterCallback.java new file mode 100644 index 000000000..b289b6e07 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReporterCallback.java @@ -0,0 +1,10 @@ + +package eu.dnetlib.dhp.aggregation.common; + +public interface ReporterCallback { + + Long getCurrent(); + + Long getTotal(); + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReportingJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReportingJob.java new file mode 100644 index 000000000..791226034 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/ReportingJob.java @@ -0,0 +1,41 @@ + +package eu.dnetlib.dhp.aggregation.common; + +import java.util.TimerTask; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; + +public abstract class ReportingJob { + + /** + * Frequency (seconds) for sending ongoing messages to report the collection task advancement + */ + public static final int ONGOING_REPORT_FREQUENCY = 5; + + /** + * Initial delay (seconds) for sending ongoing messages to report the collection task advancement + */ + public static final int INITIAL_DELAY = 2; + + private ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor(); + + protected final AggregatorReport report; + + public ReportingJob(AggregatorReport report) { + this.report = report; + } + + protected void schedule(final 
ReporterCallback callback) { + executor.scheduleAtFixedRate(new TimerTask() { + @Override + public void run() { + report.ongoing(callback.getCurrent(), callback.getTotal()); + } + }, INITIAL_DELAY, ONGOING_REPORT_FREQUENCY, TimeUnit.SECONDS); + } + + protected void shutdown() { + executor.shutdown(); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java index 154b50414..a397c4f9d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java @@ -5,11 +5,8 @@ import static eu.dnetlib.dhp.common.Constants.SEQUENCE_FILE_NAME; import java.io.IOException; import java.util.Optional; -import java.util.Timer; -import java.util.TimerTask; import java.util.concurrent.atomic.AtomicInteger; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; @@ -20,15 +17,17 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.aggregation.common.ReporterCallback; +import eu.dnetlib.dhp.aggregation.common.ReportingJob; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; -public class CollectorWorker { +public class CollectorWorker extends ReportingJob { private static final Logger log = LoggerFactory.getLogger(CollectorWorker.class); - public static final int ONGOING_REPORT_FREQUENCY_MS = 5000; private final ApiDescriptor api; @@ -38,19 +37,17 @@ public class CollectorWorker { private final HttpClientParams clientParams; - private final CollectorPluginReport report; - public CollectorWorker( final ApiDescriptor api, final FileSystem fileSystem, final MDStoreVersion mdStoreVersion, final HttpClientParams clientParams, - final CollectorPluginReport report) { + final AggregatorReport report) { + super(report); this.api = api; this.fileSystem = fileSystem; this.mdStoreVersion = mdStoreVersion; this.clientParams = clientParams; - this.report = report; } public void collect() throws UnknownCollectorPluginException, CollectorException, IOException { @@ -61,13 +58,7 @@ public class CollectorWorker { final CollectorPlugin plugin = getCollectorPlugin(); final AtomicInteger counter = new AtomicInteger(0); - final Timer timer = new Timer(); - timer.schedule(new TimerTask() { - @Override - public void run() { - report.ongoing(counter.longValue(), null); - } - }, 5000, ONGOING_REPORT_FREQUENCY_MS); + scheduleReport(counter); try (SequenceFile.Writer writer = SequenceFile .createWriter( @@ -94,30 +85,46 @@ public class CollectorWorker { report.put(e.getClass().getName(), e.getMessage()); throw new CollectorException(e); } finally { - timer.cancel(); + shutdown(); report.ongoing(counter.longValue(), counter.longValue()); } } + private void scheduleReport(AtomicInteger counter) { + schedule(new ReporterCallback() { + @Override + public Long getCurrent() { + return counter.longValue(); + } + + @Override + public Long getTotal() { + return null; + } + }); + } + private 
CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException { - switch (StringUtils.lowerCase(StringUtils.trim(api.getProtocol()))) { - case "oai": + + switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) { + case oai: return new OaiCollectorPlugin(clientParams); - case "other": - final String plugin = Optional + case other: + final CollectorPlugin.NAME.OTHER_NAME plugin = Optional .ofNullable(api.getParams().get("other_plugin_type")) - .orElseThrow(() -> new UnknownCollectorPluginException("other_plugin_type")); + .map(CollectorPlugin.NAME.OTHER_NAME::valueOf) + .get(); switch (plugin) { - case "mdstore_mongodb_dump": + case mdstore_mongodb_dump: return new MongoDbDumpCollectorPlugin(fileSystem); - case "mdstore_mongodb": + case mdstore_mongodb: return new MongoDbCollectorPlugin(); default: - throw new UnknownCollectorPluginException("Unknown plugin type: " + plugin); + throw new UnknownCollectorPluginException("plugin is not managed: " + plugin); } default: - throw new UnknownCollectorPluginException("Unknown protocol: " + api.getProtocol()); + throw new UnknownCollectorPluginException("protocol is not managed: " + api.getProtocol()); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java index 15f3f20b5..2c5640499 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorkerApplication.java @@ -10,11 +10,11 @@ import java.util.Optional; import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.message.MessageSender; @@ -80,11 +80,10 @@ public class CollectorWorkerApplication { String dnetMessageManagerURL, String workflowId) throws IOException, CollectorException, UnknownCollectorPluginException { + final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class); final MessageSender ms = new MessageSender(dnetMessageManagerURL, workflowId); - final MDStoreVersion currentVersion = MAPPER.readValue(mdStoreVersion, MDStoreVersion.class); - - try (CollectorPluginReport report = new CollectorPluginReport(ms)) { + try (AggregatorReport report = new AggregatorReport(ms)) { new CollectorWorker(api, fileSystem, currentVersion, clientParams, report).collect(); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java index 72a2a70a2..ddf9efa36 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java @@ -15,6 +15,8 @@ import org.apache.http.HttpHeaders; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; + /** * Migrated from 
https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java * @@ -42,17 +44,17 @@ public class HttpConnector2 { } /** - * @see HttpConnector2#getInputSource(java.lang.String, CollectorPluginReport) + * @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport) */ public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException { return IOUtils.toInputStream(getInputSource(requestUrl)); } /** - * @see HttpConnector2#getInputSource(java.lang.String, CollectorPluginReport) + * @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport) */ public String getInputSource(final String requestUrl) throws CollectorException { - return attemptDownloadAsString(requestUrl, 1, new CollectorPluginReport()); + return attemptDownloadAsString(requestUrl, 1, new AggregatorReport()); } /** @@ -63,13 +65,13 @@ public class HttpConnector2 { * @return the content of the downloaded resource * @throws CollectorException when retrying more than maxNumberOfRetry times */ - public String getInputSource(final String requestUrl, CollectorPluginReport report) + public String getInputSource(final String requestUrl, AggregatorReport report) throws CollectorException { return attemptDownloadAsString(requestUrl, 1, report); } private String attemptDownloadAsString(final String requestUrl, final int retryNumber, - final CollectorPluginReport report) throws CollectorException { + final AggregatorReport report) throws CollectorException { try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) { return IOUtils.toString(s); @@ -80,7 +82,7 @@ public class HttpConnector2 { } private InputStream attemptDownload(final String requestUrl, final int retryNumber, - final CollectorPluginReport report) throws CollectorException, IOException { + final AggregatorReport report) throws CollectorException, IOException { if (retryNumber > getClientParams().getMaxNumberOfRetry()) { final String msg = String diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index 0a4b3a892..0ed6be5fa 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -3,12 +3,21 @@ package eu.dnetlib.dhp.collection.plugin; import java.util.stream.Stream; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.CollectorException; -import eu.dnetlib.dhp.collection.CollectorPluginReport; public interface CollectorPlugin { - Stream collect(ApiDescriptor api, CollectorPluginReport report) throws CollectorException; + enum NAME { + oai, other; + + public enum OTHER_NAME { + mdstore_mongodb_dump, mdstore_mongodb + } + + } + + Stream collect(ApiDescriptor api, AggregatorReport report) throws CollectorException; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbCollectorPlugin.java index 7d1952f9c..89b92ffa1 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbCollectorPlugin.java +++ 
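One remark on the dispatch rewrite in patch 53 above: CollectorWorker now switches on CollectorPlugin.NAME.valueOf(api.getProtocol()) instead of lowercased strings, but the Optional around "other_plugin_type" is terminated with .get(), so a missing parameter now surfaces as a bare NoSuchElementException (and an unknown value as an IllegalArgumentException from valueOf) rather than the explicit UnknownCollectorPluginException raised before. A hedged variant that keeps the enum dispatch but restores the explicit error, using only types already present in the patch:

    // sketch only: reinstates a descriptive failure for the missing-parameter case
    final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
        .ofNullable(api.getParams().get("other_plugin_type"))
        .map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
        .orElseThrow(() -> new UnknownCollectorPluginException("missing parameter 'other_plugin_type'"));
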
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbCollectorPlugin.java @@ -13,9 +13,9 @@ import com.mongodb.MongoClient; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.CollectorException; -import eu.dnetlib.dhp.collection.CollectorPluginReport; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; public class MongoDbCollectorPlugin implements CollectorPlugin { @@ -26,7 +26,7 @@ public class MongoDbCollectorPlugin implements CollectorPlugin { public static final String MONGODB_DBNAME = "mongodb_dbname"; @Override - public Stream collect(ApiDescriptor api, CollectorPluginReport report) throws CollectorException { + public Stream collect(ApiDescriptor api, AggregatorReport report) throws CollectorException { final String host = Optional .ofNullable(api.getParams().get(MONGODB_HOST)) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbDumpCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbDumpCollectorPlugin.java index d08732593..3199af5b7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbDumpCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbDumpCollectorPlugin.java @@ -12,9 +12,9 @@ import java.util.zip.GZIPInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.CollectorException; -import eu.dnetlib.dhp.collection.CollectorPluginReport; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.utils.DHPUtils; @@ -30,7 +30,7 @@ public class MongoDbDumpCollectorPlugin implements CollectorPlugin { } @Override - public Stream collect(ApiDescriptor api, CollectorPluginReport report) throws CollectorException { + public Stream collect(ApiDescriptor api, AggregatorReport report) throws CollectorException { final Path path = Optional .ofNullable(api.getParams().get("path")) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index 8efdeb838..4600562ca 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -13,9 +13,9 @@ import com.google.common.base.Splitter; import com.google.common.collect.Iterators; import com.google.common.collect.Lists; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.CollectorException; -import eu.dnetlib.dhp.collection.CollectorPluginReport; import eu.dnetlib.dhp.collection.HttpClientParams; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; @@ -35,7 +35,7 @@ public class OaiCollectorPlugin implements CollectorPlugin { } @Override - public Stream collect(final ApiDescriptor api, final CollectorPluginReport report) + public Stream collect(final ApiDescriptor api, final AggregatorReport 
report) throws CollectorException { final String baseUrl = api.getBaseUrl(); final String mdFormat = api.getParams().get(FORMAT_PARAM); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index 0a0a4c734..887027f21 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -16,8 +16,8 @@ import org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; import eu.dnetlib.dhp.collection.CollectorException; -import eu.dnetlib.dhp.collection.CollectorPluginReport; import eu.dnetlib.dhp.collection.HttpConnector2; import eu.dnetlib.dhp.collection.XmlCleaner; @@ -38,7 +38,7 @@ public class OaiIterator implements Iterator { private String token; private boolean started; private final HttpConnector2 httpConnector; - private CollectorPluginReport report; + private AggregatorReport report; public OaiIterator( final String baseUrl, @@ -47,7 +47,7 @@ public class OaiIterator implements Iterator { final String fromDate, final String untilDate, final HttpConnector2 httpConnector, - final CollectorPluginReport report) { + final AggregatorReport report) { this.baseUrl = baseUrl; this.mdFormat = mdFormat; this.set = set; @@ -188,7 +188,7 @@ public class OaiIterator implements Iterator { return doc.valueOf("//*[local-name()='resumptionToken']"); } - public CollectorPluginReport getReport() { + public AggregatorReport getReport() { return report; } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java index a72a62f13..48f6a94c8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIteratorFactory.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.collection.plugin.oai; import java.util.Iterator; -import eu.dnetlib.dhp.collection.CollectorPluginReport; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; import eu.dnetlib.dhp.collection.HttpClientParams; import eu.dnetlib.dhp.collection.HttpConnector2; @@ -18,7 +18,7 @@ public class OaiIteratorFactory { final String fromDate, final String untilDate, final HttpClientParams clientParams, - final CollectorPluginReport report) { + final AggregatorReport report) { return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector(clientParams), report); } From 545f8f3e485151c3ac3d865d240c5cd0c4b2f058 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 17 Feb 2021 12:15:00 +0100 Subject: [PATCH 54/86] using jackson objectmapper instead of GSon to serialise the aggregation report --- .../eu/dnetlib/dhp/aggregation/common/AggregatorReport.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java index 269f8f6e9..9f91c4247 100644 --- 
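The change in patch 54 swaps Gson for the shared Jackson ObjectMapper exposed by DHPUtils, presumably so the report payload is serialised the same way as the rest of dhp-common and the report path no longer depends on Gson. The resulting close() body, reconstructed from the hunks (exception handling elided):

    // inside AggregatorReport.close(), after the patch
    Map<String, String> m = new HashMap<>();
    m.put(getClass().getSimpleName().toLowerCase(), DHPUtils.MAPPER.writeValueAsString(values()));
    messageSender.sendReport(m);
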
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java @@ -8,6 +8,7 @@ import java.util.LinkedHashMap; import java.util.Map; import java.util.Objects; +import eu.dnetlib.dhp.utils.DHPUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,7 +40,7 @@ public class AggregatorReport extends LinkedHashMap implements C this.forEach((k, v) -> log.info("{} - {}", k, v)); Map m = new HashMap<>(); - m.put(getClass().getSimpleName().toLowerCase(), new Gson().toJson(values())); + m.put(getClass().getSimpleName().toLowerCase(), DHPUtils.MAPPER.writeValueAsString(values())); messageSender.sendReport(m); } } From cc88701f29eaea235fe596c65d7815a048e49645 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 17 Feb 2021 16:13:54 +0100 Subject: [PATCH 55/86] retry for any Socket exception --- .../main/java/eu/dnetlib/dhp/collection/HttpConnector2.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java index ddf9efa36..9d8b8d34b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/HttpConnector2.java @@ -176,11 +176,11 @@ public class HttpConnector2 { .format( "Unexpected status code: %s errors: %s", urlConn.getResponseCode(), MAPPER.writeValueAsString(report))); - } catch (MalformedURLException | SocketException | UnknownHostException e) { + } catch (MalformedURLException | UnknownHostException e) { log.error(e.getMessage(), e); report.put(e.getClass().getName(), e.getMessage()); throw new CollectorException(e.getMessage(), e); - } catch (SocketTimeoutException e) { + } catch (SocketTimeoutException | SocketException e) { log.error(e.getMessage(), e); report.put(e.getClass().getName(), e.getMessage()); backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000); From 58467aaf1eef7043688cf49d2c5cc1ffd745dca3 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 17 Feb 2021 16:14:41 +0100 Subject: [PATCH 56/86] WIP: transformation workflow error reporting --- .../aggregation/common/AggregatorReport.java | 2 +- .../transformation/TransformSparkJobNode.java | 54 +++++++++++++------ .../dhp/transformation/oozie_app/workflow.xml | 10 ++++ .../transformation_input_parameters.json | 15 ++++-- 4 files changed, 61 insertions(+), 20 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java index 9f91c4247..c822a6723 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/aggregation/common/AggregatorReport.java @@ -8,13 +8,13 @@ import java.util.LinkedHashMap; import java.util.Map; import java.util.Objects; -import eu.dnetlib.dhp.utils.DHPUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.gson.Gson; import eu.dnetlib.dhp.message.MessageSender; +import eu.dnetlib.dhp.utils.DHPUtils; public class AggregatorReport extends LinkedHashMap implements Closeable { diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index f9a18987d..0b3de6490 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -21,11 +21,14 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.aggregation.common.AggregationCounter; +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.message.MessageSender; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import parquet.hadoop.ParquetReader; public class TransformSparkJobNode { @@ -54,7 +57,7 @@ public class TransformSparkJobNode { final MDStoreVersion nativeMdStoreVersion = MAPPER.readValue(mdstoreInputVersion, MDStoreVersion.class); final String inputPath = nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH; log.info("inputPath: {}", inputPath); - + ParquetReader final MDStoreVersion cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, MDStoreVersion.class); final String outputBasePath = cleanedMdStoreVersion.getHdfsPath(); log.info("outputBasePath: {}", outputBasePath); @@ -91,23 +94,42 @@ public class TransformSparkJobNode { final AggregationCounter ct = new AggregationCounter(totalItems, errorItems, transformedItems); final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mdstore = spark - .read() - .format("parquet") - .load(inputPath) - .as(encoder) - .map( - TransformationFactory.getTransformationPlugin(args, ct, isLookUpService), - encoder); - saveDataset(mdstore, outputBasePath + MDSTORE_DATA_PATH); + final String dnetMessageManagerURL = args.get(DNET_MESSAGE_MGR_URL); + log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL); - log.info("Transformed item " + ct.getProcessedItems().count()); - log.info("Total item " + ct.getTotalItems().count()); - log.info("Transformation Error item " + ct.getErrorItems().count()); + final String workflowId = args.get("workflowId"); + log.info("workflowId is {}", workflowId); - writeHdfsFile( - spark.sparkContext().hadoopConfiguration(), - "" + spark.read().load(outputBasePath + MDSTORE_DATA_PATH).count(), outputBasePath + MDSTORE_SIZE_PATH); + final MessageSender messageSender = new MessageSender(dnetMessageManagerURL, workflowId); + try (AggregatorReport report = new AggregatorReport(messageSender)) { + try { + final Dataset mdstore = spark + .read() + .format("parquet") + .load(inputPath) + .as(encoder) + .map( + TransformationFactory.getTransformationPlugin(args, ct, isLookUpService), + encoder); + saveDataset(mdstore, outputBasePath + MDSTORE_DATA_PATH); + + log.info("Transformed item " + ct.getProcessedItems().count()); + log.info("Total item " + ct.getTotalItems().count()); + log.info("Transformation Error item " + ct.getErrorItems().count()); + + final long mdStoreSize = spark.read().load(outputBasePath + MDSTORE_DATA_PATH).count(); + writeHdfsFile( + spark.sparkContext().hadoopConfiguration(), + "" + mdStoreSize, outputBasePath + MDSTORE_SIZE_PATH); + } catch (Throwable e) { + log.error("error 
during record transformation", e); + report.put(TransformSparkJobNode.class.getSimpleName(), e.getMessage()); + report.put(CONTENT_TOTALITEMS, ct.getTotalItems().value().toString()); + report.put(CONTENT_INVALIDRECORDS, ct.getErrorItems().value().toString()); + report.put(CONTENT_TRANSFORMEDRECORDS, ct.getProcessedItems().value().toString()); + throw e; + } + } } } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml index 9e01936d4..61e5710fa 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml @@ -29,6 +29,14 @@ isLookupUrl The IS lookUp service endopoint + + workflowId + The identifier of the workflow + + + dnetMessageManagerURL + The URI of the Dnet Message Manager + @@ -95,6 +103,8 @@ --transformationPlugin${transformationPlugin} --transformationRuleId${transformationRuleId} --isLookupUrl${isLookupUrl} + --workflowId${workflowId} + --dnetMessageManagerURL${dnetMessageManagerURL} diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json index d92698de5..ee9099dde 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/transformation_input_parameters.json @@ -36,9 +36,18 @@ "paramDescription": "the Information System Service LookUp URL", "paramRequired": true }, - - - + { + "paramName": "dm", + "paramLongName": "dnetMessageManagerURL", + "paramDescription": "the End point URL to send Messages", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workflowId", + "paramDescription": "the identifier of the dnet Workflow", + "paramRequired": true + }, { "paramName": "tp", "paramLongName": "transformationPlugin", From e7eba9f7e71afd638ad00dcb2d1b83521aa74719 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 17 Feb 2021 16:54:08 +0100 Subject: [PATCH 57/86] WIP: transformation workflow error reporting; cleanup --- .../eu/dnetlib/dhp/transformation/TransformSparkJobNode.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index 0b3de6490..cc130c376 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -57,7 +57,7 @@ public class TransformSparkJobNode { final MDStoreVersion nativeMdStoreVersion = MAPPER.readValue(mdstoreInputVersion, MDStoreVersion.class); final String inputPath = nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH; log.info("inputPath: {}", inputPath); - ParquetReader + final MDStoreVersion cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, MDStoreVersion.class); final String outputBasePath = cleanedMdStoreVersion.getHdfsPath(); log.info("outputBasePath: {}", outputBasePath); From 
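Taken together, patches 51-57 converge on a single reporting idiom: AggregatorReport is a Closeable LinkedHashMap that accumulates key/value diagnostics, ReportingJob subclasses emit ONGOING messages from a single-threaded scheduler while work proceeds, and closing the report serialises the accumulated values and ships them as a REPORT message. The caller-side pattern, as it appears in CollectorWorkerApplication and TransformSparkJobNode:

    final MessageSender ms = new MessageSender(dnetMessageManagerURL, workflowId);
    try (AggregatorReport report = new AggregatorReport(ms)) {
        new CollectorWorker(api, fileSystem, currentVersion, clientParams, report).collect();
    } // close() sends the accumulated entries as a REPORT message
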
fc3fa5e3436db1c1144afa9703e2d5e3a4e4a9c2 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 24 Feb 2021 15:07:24 +0100 Subject: [PATCH 58/86] implemented mdstore collector plugin --- dhp-common/pom.xml | 4 +++ .../eu/dnetlib/dhp}/common/MdstoreClient.java | 15 +++++++- .../dhp/common/rest/DNetRestClient.java | 15 ++++++++ .../dhp/collection/CollectorWorker.java | 4 +-- ...lugin.java => MDStoreCollectorPlugin.java} | 36 +++++++------------ .../raw/MigrateMongoMdstoresApplication.java | 2 +- 6 files changed, 49 insertions(+), 27 deletions(-) rename {dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw => dhp-common/src/main/java/eu/dnetlib/dhp}/common/MdstoreClient.java (84%) rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/{MongoDbCollectorPlugin.java => MDStoreCollectorPlugin.java} (54%) diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index e2db8b451..7c8be8d3e 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -98,6 +98,10 @@ httpclient + + org.mongodb + mongo-java-driver + eu.dnetlib.dhp diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java similarity index 84% rename from dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java index a2177935a..236e4d8b0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java @@ -1,13 +1,16 @@ -package eu.dnetlib.dhp.oa.graph.raw.common; +package eu.dnetlib.dhp.common; import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; +import java.util.Optional; import java.util.stream.StreamSupport; +import com.mongodb.BasicDBObject; +import com.mongodb.QueryBuilder; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -34,6 +37,16 @@ public class MdstoreClient implements Closeable { this.db = getDb(client, dbName); } + public MongoCollection mdStore(final String mdId) { + BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get(); + + final String currentId = Optional.ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query)) + .map(r -> r.first()) + .map(d -> d.getString("currentId")) + .orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId)); + return getColl(db, currentId, true); + } + public Map validCollections( final String mdFormat, final String mdLayout, final String mdInterpretation) { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java index 014f18606..27713d9b5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java @@ -11,9 +11,16 @@ import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import com.fasterxml.jackson.databind.ObjectMapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Arrays; +import java.util.stream.Collectors; public class DNetRestClient { + private static final Logger log = 
LoggerFactory.getLogger(DNetRestClient.class); + private static ObjectMapper mapper = new ObjectMapper(); public static T doGET(final String url, Class clazz) throws Exception { @@ -44,6 +51,14 @@ public class DNetRestClient { private static String doHTTPRequest(final HttpUriRequest r) throws Exception { CloseableHttpClient client = HttpClients.createDefault(); + + log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString()); + log.info("request headers: {}", + Arrays.asList(r.getAllHeaders()) + .stream() + .map(h -> h.getName() + ":" + h.getValue()) + .collect(Collectors.joining(","))); + CloseableHttpResponse response = client.execute(r); return IOUtils.toString(response.getEntity().getContent()); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java index a397c4f9d..ef29cb5b1 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java @@ -21,7 +21,7 @@ import eu.dnetlib.dhp.aggregation.common.AggregatorReport; import eu.dnetlib.dhp.aggregation.common.ReporterCallback; import eu.dnetlib.dhp.aggregation.common.ReportingJob; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbCollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; @@ -119,7 +119,7 @@ public class CollectorWorker extends ReportingJob { case mdstore_mongodb_dump: return new MongoDbDumpCollectorPlugin(fileSystem); case mdstore_mongodb: - return new MongoDbCollectorPlugin(); + return new MDStoreCollectorPlugin(); default: throw new UnknownCollectorPluginException("plugin is not managed: " + plugin); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java similarity index 54% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbCollectorPlugin.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java index 89b92ffa1..33b9111dd 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MongoDbCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java @@ -7,48 +7,38 @@ import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; -import org.bson.Document; - -import com.mongodb.MongoClient; import com.mongodb.client.MongoCollection; -import com.mongodb.client.MongoDatabase; +import eu.dnetlib.dhp.common.MdstoreClient; import eu.dnetlib.dhp.aggregation.common.AggregatorReport; import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.CollectorException; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import org.bson.Document; -public class MongoDbCollectorPlugin implements CollectorPlugin { +public class MDStoreCollectorPlugin implements CollectorPlugin { - public static final String 
MONGODB_HOST = "mongodb_host"; - public static final String MONGODB_PORT = "mongodb_port"; - public static final String MONGODB_COLLECTION = "mongodb_collection"; + public static final String MONGODB_BASEURL = "mongodb_baseurl"; public static final String MONGODB_DBNAME = "mongodb_dbname"; + public static final String MDSTORE_ID = "mongodb_collection"; @Override public Stream collect(ApiDescriptor api, AggregatorReport report) throws CollectorException { - final String host = Optional - .ofNullable(api.getParams().get(MONGODB_HOST)) - .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_HOST))); - - final Integer port = Optional - .ofNullable(api.getParams().get(MONGODB_PORT)) - .map(Integer::parseInt) - .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_PORT))); + final String mongoBaseUrl = Optional + .ofNullable(api.getParams().get(MONGODB_BASEURL)) + .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_BASEURL))); final String dbName = Optional .ofNullable(api.getParams().get(MONGODB_DBNAME)) .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_DBNAME))); - final String collection = Optional - .ofNullable(api.getParams().get(MONGODB_COLLECTION)) - .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_COLLECTION))); - - final MongoClient mongoClient = new MongoClient(host, port); - final MongoDatabase database = mongoClient.getDatabase(dbName); - final MongoCollection mdstore = database.getCollection(collection); + final String mdId = Optional + .ofNullable(api.getParams().get(MDSTORE_ID)) + .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MDSTORE_ID))); + final MdstoreClient client = new MdstoreClient(mongoBaseUrl, dbName); + final MongoCollection mdstore = client.mdStore(mdId); long size = mdstore.count(); return StreamSupport diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java index e7703bf72..9e7e051de 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java @@ -12,7 +12,7 @@ import org.apache.commons.logging.LogFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; -import eu.dnetlib.dhp.oa.graph.raw.common.MdstoreClient; +import eu.dnetlib.dhp.common.MdstoreClient; public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable { From 9c899f44335278053fb47524dd112ff476a15488 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 24 Feb 2021 15:07:59 +0100 Subject: [PATCH 59/86] cleanup on transformation functions and the relative tests --- .../dhp/transformation/xslt/Cleaner.java | 7 +- .../dhp/transformation/xslt/DateCleaner.java | 6 +- .../xslt/XSLTTransformationFunction.java | 2 + .../transformation/TransformationJobTest.java | 97 +++++++------------ .../eu/dnetlib/dhp/transform/ext_simple.xsl | 2 +- .../eu/dnetlib/dhp/transform/zenodo_tr.xslt | 4 +- 6 files changed, 51 insertions(+), 67 deletions(-) diff --git 
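Before the transformation cleanup below, one configuration note on the mdstore plugin introduced in patch 58: it is reached through the "other" protocol, and it reuses the legacy "mongodb_collection" parameter key to carry the logical mdId, which MdstoreClient.mdStore() then resolves to the current backing collection via the metadata manager. A hypothetical ApiDescriptor wiring; the setter and the concrete values are assumptions, only the parameter keys come from the code:

    ApiDescriptor api = new ApiDescriptor();
    api.setProtocol("other");                                       // assumed setter
    api.getParams().put("other_plugin_type", "mdstore_mongodb");
    api.getParams().put("mongodb_baseurl", "mongodb://host:27017"); // placeholder URI
    api.getParams().put("mongodb_dbname", "mdstore");               // placeholder db name
    api.getParams().put("mongodb_collection", "<mdId>");            // legacy key, now holds the mdId
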
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java index 124f68325..50ffd304b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java @@ -4,7 +4,10 @@ package eu.dnetlib.dhp.transformation.xslt; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Qualifier; import net.sf.saxon.s9api.*; -import scala.Serializable; + +import java.io.Serializable; + +import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI; public class Cleaner implements ExtensionFunction, Serializable { @@ -16,7 +19,7 @@ public class Cleaner implements ExtensionFunction, Serializable { @Override public QName getName() { - return new QName("http://eu/dnetlib/transform/extension", "clean"); + return new QName(QNAME_BASE_URI + "/clean", "clean"); } @Override diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java index 4e1a29b52..479dd9854 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.transformation.xslt; +import java.io.Serializable; import java.time.LocalDate; import java.time.format.DateTimeFormatter; import java.util.*; @@ -8,7 +9,8 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import net.sf.saxon.s9api.*; -import scala.Serializable; + +import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI; public class DateCleaner implements ExtensionFunction, Serializable { @@ -91,7 +93,7 @@ public class DateCleaner implements ExtensionFunction, Serializable { @Override public QName getName() { - return new QName("http://eu/dnetlib/trasform/dates", "dateISO"); + return new QName(QNAME_BASE_URI + "/dateISO", "dateISO"); } @Override diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java index 7d47cc84d..a813d84db 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java @@ -15,6 +15,8 @@ import net.sf.saxon.s9api.*; public class XSLTTransformationFunction implements MapFunction { + public final static String QNAME_BASE_URI = "http://eu/dnetlib/transform"; + private final AggregationCounter aggregationCounter; private final String transformationRule; diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 091089eb9..50aa2ea08 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -5,7 +5,6 @@ import 
static eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH; import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; import java.util.Map; import java.util.stream.Collectors; @@ -35,26 +34,11 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; @ExtendWith(MockitoExtension.class) public class TransformationJobTest extends AbstractVocabularyTest { - private static SparkSession spark; - - @BeforeAll - public static void beforeAll() throws IOException, ISLookUpException { - SparkConf conf = new SparkConf(); - conf.setAppName(TransformationJobTest.class.getSimpleName()); - conf.setMaster("local"); - spark = SparkSession.builder().config(conf).getOrCreate(); - } - @BeforeEach public void setUp() throws IOException, ISLookUpException { setUpVocabulary(); } - @AfterAll - public static void afterAll() { - spark.stop(); - } - @Test @DisplayName("Test Date cleaner") public void testDateCleaner() throws Exception { @@ -82,68 +66,61 @@ public class TransformationJobTest extends AbstractVocabularyTest { // Print the record System.out.println(result.getBody()); // TODO Create significant Assert - } - @DisplayName("Test TransformSparkJobNode.main") @Test + @DisplayName("Test TransformSparkJobNode.main") public void transformTest(@TempDir Path testDir) throws Exception { - final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); - final String mdstore_output = testDir.toString() + "/version"; + SparkConf conf = new SparkConf(); + conf.setAppName(TransformationJobTest.class.getSimpleName()); + conf.setMaster("local"); - mockupTrasformationRule("simpleTRule", "/eu/dnetlib/dhp/transform/ext_simple.xsl"); + try(SparkSession spark = SparkSession.builder().config(conf).getOrCreate()) { - final Map parameters = Stream.of(new String[][] { - { - "dateOfTransformation", "1234" - }, - { - "transformationPlugin", "XSLT_TRANSFORM" - }, - { - "transformationRuleId", "simpleTRule" - }, + final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); + final String mdstore_output = testDir.toString() + "/version"; - }).collect(Collectors.toMap(data -> data[0], data -> data[1])); + mockupTrasformationRule("simpleTRule", "/eu/dnetlib/dhp/transform/ext_simple.xsl"); - TransformSparkJobNode.transformRecords(parameters, isLookUpService, spark, mdstore_input, mdstore_output); + final Map parameters = Stream.of(new String[][]{ + { + "dateOfTransformation", "1234" + }, + { + "transformationPlugin", "XSLT_TRANSFORM" + }, + { + "transformationRuleId", "simpleTRule" + }, - // TODO introduce useful assertions + }).collect(Collectors.toMap(data -> data[0], data -> data[1])); - final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mOutput = spark - .read() - .format("parquet") - .load(mdstore_output + MDSTORE_DATA_PATH) - .as(encoder); + TransformSparkJobNode.transformRecords(parameters, isLookUpService, spark, mdstore_input, mdstore_output); - final Long total = mOutput.count(); + // TODO introduce useful assertions - final long recordTs = mOutput - .filter((FilterFunction) p -> p.getDateOfTransformation() == 1234) - .count(); + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset mOutput = spark + .read() + .format("parquet") + .load(mdstore_output + MDSTORE_DATA_PATH) + .as(encoder); - final long recordNotEmpty = mOutput - .filter((FilterFunction) p -> !StringUtils.isBlank(p.getBody())) - 
.count(); + final Long total = mOutput.count(); - assertEquals(total, recordTs); + final long recordTs = mOutput + .filter((FilterFunction) p -> p.getDateOfTransformation() == 1234) + .count(); - assertEquals(total, recordNotEmpty); + final long recordNotEmpty = mOutput + .filter((FilterFunction) p -> !StringUtils.isBlank(p.getBody())) + .count(); - } + assertEquals(total, recordTs); - @Test - public void tryLoadFolderOnCP() throws Exception { - final String path = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); - System.out.println("path = " + path); - - Path tempDirWithPrefix = Files.createTempDirectory("mdstore_output"); - - System.out.println(tempDirWithPrefix.toFile().getAbsolutePath()); - - Files.deleteIfExists(tempDirWithPrefix); + assertEquals(total, recordNotEmpty); + } } private XSLTTransformationFunction loadTransformationRule(final String path) throws Exception { diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl index e2a439315..8f8ce2270 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl @@ -1,7 +1,7 @@ diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt index 23e57579b..9a02c9071 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/zenodo_tr.xslt @@ -3,8 +3,8 @@ xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:oaf="http://namespace.openaire.eu/oaf" - xmlns:vocabulary="http://eu/dnetlib/trasform/extension" - xmlns:dateCleaner="http://eu/dnetlib/trasform/dates" + xmlns:vocabulary="http://eu/dnetlib/transform/clean" + xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO" xmlns:dr="http://www.driver-repository.eu/namespace/dr" exclude-result-prefixes="xsl vocabulary dateCleaner"> From 271e88537bc29089611b4c3b5b14b7b552ee3eef Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 25 Feb 2021 12:28:56 +0100 Subject: [PATCH 60/86] code formatting --- .../dhp/transformation/xslt/Cleaner.java | 8 ++-- .../dhp/transformation/xslt/DateCleaner.java | 4 +- .../transformation/TransformationJobTest.java | 43 ++++++++++--------- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java index 50ffd304b..664215c0e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java @@ -1,14 +1,14 @@ package eu.dnetlib.dhp.transformation.xslt; +import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI; + +import java.io.Serializable; + import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Qualifier; import net.sf.saxon.s9api.*; -import java.io.Serializable; - -import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI; - public class 
Cleaner implements ExtensionFunction, Serializable { private final VocabularyGroup vocabularies; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java index 479dd9854..6e337604f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.transformation.xslt; +import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI; + import java.io.Serializable; import java.time.LocalDate; import java.time.format.DateTimeFormatter; @@ -10,8 +12,6 @@ import java.util.regex.Pattern; import net.sf.saxon.s9api.*; -import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI; - public class DateCleaner implements ExtensionFunction, Serializable { private final static List dateRegex = Arrays diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 50aa2ea08..3c0c8bf0f 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -76,23 +76,26 @@ public class TransformationJobTest extends AbstractVocabularyTest { conf.setAppName(TransformationJobTest.class.getSimpleName()); conf.setMaster("local"); - try(SparkSession spark = SparkSession.builder().config(conf).getOrCreate()) { + try (SparkSession spark = SparkSession.builder().config(conf).getOrCreate()) { - final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); + final String mdstore_input = this + .getClass() + .getResource("/eu/dnetlib/dhp/transform/mdstorenative") + .getFile(); final String mdstore_output = testDir.toString() + "/version"; mockupTrasformationRule("simpleTRule", "/eu/dnetlib/dhp/transform/ext_simple.xsl"); - final Map parameters = Stream.of(new String[][]{ - { - "dateOfTransformation", "1234" - }, - { - "transformationPlugin", "XSLT_TRANSFORM" - }, - { - "transformationRuleId", "simpleTRule" - }, + final Map parameters = Stream.of(new String[][] { + { + "dateOfTransformation", "1234" + }, + { + "transformationPlugin", "XSLT_TRANSFORM" + }, + { + "transformationRuleId", "simpleTRule" + }, }).collect(Collectors.toMap(data -> data[0], data -> data[1])); @@ -102,20 +105,20 @@ public class TransformationJobTest extends AbstractVocabularyTest { final Encoder encoder = Encoders.bean(MetadataRecord.class); final Dataset mOutput = spark - .read() - .format("parquet") - .load(mdstore_output + MDSTORE_DATA_PATH) - .as(encoder); + .read() + .format("parquet") + .load(mdstore_output + MDSTORE_DATA_PATH) + .as(encoder); final Long total = mOutput.count(); final long recordTs = mOutput - .filter((FilterFunction) p -> p.getDateOfTransformation() == 1234) - .count(); + .filter((FilterFunction) p -> p.getDateOfTransformation() == 1234) + .count(); final long recordNotEmpty = mOutput - .filter((FilterFunction) p -> !StringUtils.isBlank(p.getBody())) - .count(); + .filter((FilterFunction) p -> !StringUtils.isBlank(p.getBody())) + .count(); assertEquals(total, recordTs); From 
dc98c39500a74fb55d5e951d80f1c38bcb286ec4 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 25 Feb 2021 12:29:18 +0100 Subject: [PATCH 61/86] more logging --- .../dhp/common/rest/DNetRestClient.java | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java index 27713d9b5..853d22bc2 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java @@ -1,6 +1,9 @@ package eu.dnetlib.dhp.common.rest; +import java.util.Arrays; +import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; @@ -9,13 +12,10 @@ import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; - -import com.fasterxml.jackson.databind.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Arrays; -import java.util.stream.Collectors; +import com.fasterxml.jackson.databind.ObjectMapper; public class DNetRestClient { @@ -53,11 +53,14 @@ public class DNetRestClient { CloseableHttpClient client = HttpClients.createDefault(); log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString()); - log.info("request headers: {}", - Arrays.asList(r.getAllHeaders()) - .stream() - .map(h -> h.getName() + ":" + h.getValue()) - .collect(Collectors.joining(","))); + log + .info( + "request headers: {}", + Arrays + .asList(r.getAllHeaders()) + .stream() + .map(h -> h.getName() + ":" + h.getValue()) + .collect(Collectors.joining(","))); CloseableHttpResponse response = client.execute(r); return IOUtils.toString(response.getEntity().getContent()); From b830e333922e9b421c4955c07d03c3b5aad4fc75 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 25 Feb 2021 12:30:30 +0100 Subject: [PATCH 62/86] mdstore collector plugin --- .../java/eu/dnetlib/dhp/common/MdstoreClient.java | 13 +++++++------ .../plugin/mongodb/MDStoreCollectorPlugin.java | 14 ++++++++------ .../graph/raw/MigrateMongoMdstoresApplication.java | 2 +- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java index 236e4d8b0..d29498306 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java @@ -9,16 +9,16 @@ import java.util.Map; import java.util.Optional; import java.util.stream.StreamSupport; -import com.mongodb.BasicDBObject; -import com.mongodb.QueryBuilder; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.bson.Document; import com.google.common.collect.Iterables; +import com.mongodb.BasicDBObject; import com.mongodb.MongoClient; import com.mongodb.MongoClientURI; +import com.mongodb.QueryBuilder; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; @@ -40,10 +40,11 @@ public class MdstoreClient implements Closeable { public MongoCollection mdStore(final String mdId) { BasicDBObject query = (BasicDBObject) 
QueryBuilder.start("mdId").is(mdId).get(); - final String currentId = Optional.ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query)) - .map(r -> r.first()) - .map(d -> d.getString("currentId")) - .orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId)); + final String currentId = Optional + .ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query)) + .map(r -> r.first()) + .map(d -> d.getString("currentId")) + .orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId)); return getColl(db, currentId, true); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java index 33b9111dd..77e899cc9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java @@ -7,27 +7,29 @@ import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; +import org.bson.Document; + import com.mongodb.client.MongoCollection; -import eu.dnetlib.dhp.common.MdstoreClient; import eu.dnetlib.dhp.aggregation.common.AggregatorReport; import eu.dnetlib.dhp.collection.ApiDescriptor; import eu.dnetlib.dhp.collection.CollectorException; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import org.bson.Document; +import eu.dnetlib.dhp.common.MdstoreClient; public class MDStoreCollectorPlugin implements CollectorPlugin { - public static final String MONGODB_BASEURL = "mongodb_baseurl"; public static final String MONGODB_DBNAME = "mongodb_dbname"; - public static final String MDSTORE_ID = "mongodb_collection"; + public static final String MDSTORE_ID = "mdstore_id"; @Override public Stream collect(ApiDescriptor api, AggregatorReport report) throws CollectorException { final String mongoBaseUrl = Optional - .ofNullable(api.getParams().get(MONGODB_BASEURL)) - .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_BASEURL))); + .ofNullable(api.getBaseUrl()) + .orElseThrow( + () -> new CollectorException( + "missing mongodb baseUrl, expected in eu.dnetlib.dhp.collection.ApiDescriptor.baseUrl")); final String dbName = Optional .ofNullable(api.getParams().get(MONGODB_DBNAME)) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java index 9e7e051de..50042b569 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java @@ -11,8 +11,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; import eu.dnetlib.dhp.common.MdstoreClient; +import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable { From 7df2461ccc8b3a1362463cdb9f84613a9c45e31d Mon Sep 17 00:00:00 2001 From: Claudio 
Atzori Date: Thu, 25 Feb 2021 16:19:12 +0100 Subject: [PATCH 63/86] indent XML records collected from oai-pmh endpoints --- .../collection/plugin/oai/OaiIterator.java | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index 887027f21..65695fe8e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -1,7 +1,9 @@ package eu.dnetlib.dhp.collection.plugin.oai; +import java.io.IOException; import java.io.StringReader; +import java.io.StringWriter; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.Iterator; @@ -11,8 +13,11 @@ import java.util.concurrent.PriorityBlockingQueue; import org.apache.commons.lang.StringUtils; import org.dom4j.Document; import org.dom4j.DocumentException; +import org.dom4j.DocumentHelper; import org.dom4j.Node; +import org.dom4j.io.OutputFormat; import org.dom4j.io.SAXReader; +import org.dom4j.io.XMLWriter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -28,7 +33,6 @@ public class OaiIterator implements Iterator { private final static String REPORT_PREFIX = "oai:"; private final Queue queue = new PriorityBlockingQueue<>(); - private final SAXReader reader = new SAXReader(); private final String baseUrl; private final String set; @@ -149,13 +153,13 @@ public class OaiIterator implements Iterator { final String xml = httpConnector.getInputSource(url, report); Document doc; try { - doc = reader.read(new StringReader(xml)); + doc = DocumentHelper.parseText(xml); } catch (final DocumentException e) { log.warn("Error parsing xml, I try to clean it. 
{}", e.getMessage()); report.put(e.getClass().getName(), e.getMessage()); final String cleaned = XmlCleaner.cleanAllEntities(xml); try { - doc = reader.read(new StringReader(cleaned)); + doc = DocumentHelper.parseText(xml); } catch (final DocumentException e1) { final String resumptionToken = extractResumptionToken(xml); if (resumptionToken == null) { @@ -182,7 +186,15 @@ public class OaiIterator implements Iterator { } for (final Object o : doc.selectNodes("//*[local-name()='ListRecords']/*[local-name()='record']")) { - queue.add(((Node) o).asXML()); + final StringWriter sw = new StringWriter(); + final XMLWriter writer = new XMLWriter(sw, OutputFormat.createPrettyPrint()); + try { + writer.write((Node) o); + queue.add(sw.toString()); + } catch (IOException e) { + report.put(e.getClass().getName(), e.getMessage()); + throw new CollectorException("Error parsing XML record:\n" + ((Node) o).asXML(), e); + } } return doc.valueOf("//*[local-name()='resumptionToken']"); From e76c4f62c1b1d6a35018f5fc27eb5377a9da94be Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 26 Feb 2021 10:58:48 +0100 Subject: [PATCH 64/86] MetadataRecord moved in dhp-schemas --- dhp-schemas/pom.xml | 5 +++++ .../dhp/schema/common/ModelSupport.java | 19 +++++++++++++++++++ .../dhp/schema}/mdstore/MetadataRecord.java | 12 +++++++----- .../dhp/schema}/mdstore/Provenance.java | 2 +- .../GenerateDataciteDatasetSpark.scala | 2 +- .../GenerateNativeStoreSparkJob.java | 4 ++-- .../transformation/TransformSparkJobNode.java | 3 +-- .../transformation/TransformationFactory.java | 2 +- .../xslt/XSLTTransformationFunction.java | 2 +- .../GenerateNativeStoreSparkJobTest.java | 4 ++-- .../transformation/TransformationJobTest.java | 2 +- 11 files changed, 41 insertions(+), 16 deletions(-) rename {dhp-common/src/main/java/eu/dnetlib/dhp/model => dhp-schemas/src/main/java/eu/dnetlib/dhp/schema}/mdstore/MetadataRecord.java (87%) rename {dhp-common/src/main/java/eu/dnetlib/dhp/model => dhp-schemas/src/main/java/eu/dnetlib/dhp/schema}/mdstore/Provenance.java (96%) diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index 10ee5f9ff..c4bb9e21f 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -67,6 +67,11 @@ guava + + commons-codec + commons-codec + + diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java index b5bca2e93..b08e41a55 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java @@ -3,11 +3,15 @@ package eu.dnetlib.dhp.schema.common; import static com.google.common.base.Preconditions.checkArgument; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.function.Function; +import org.apache.commons.codec.binary.Hex; import org.apache.commons.lang3.StringUtils; import com.google.common.collect.Maps; @@ -473,4 +477,19 @@ public class ModelSupport { private static String idFnForOafEntity(T t) { return ((OafEntity) t).getId(); } + + public static String md5(final String s) { + try { + final MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(s.getBytes(StandardCharsets.UTF_8)); + return new String(Hex.encodeHex(md.digest())); + } catch (final NoSuchAlgorithmException e) { + throw new IllegalStateException(e); + } + } + + public 
static String generateIdentifier(final String originalId, final String nsPrefix) { + return String.format("%s::%s", nsPrefix, md5(originalId)); + } + } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/MetadataRecord.java similarity index 87% rename from dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java rename to dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/MetadataRecord.java index 0b59dcce0..9586680e3 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/MetadataRecord.java @@ -1,12 +1,14 @@ -package eu.dnetlib.dhp.model.mdstore; +package eu.dnetlib.dhp.schema.mdstore; import java.io.Serializable; -import eu.dnetlib.dhp.utils.DHPUtils; +import eu.dnetlib.dhp.schema.common.ModelSupport; -/** This class models a record inside the new Metadata store collection on HDFS * */ -public class MetadataRecord implements Serializable { +/** + * This class models a record in a Metadata store collection on HDFS + */ + public class MetadataRecord implements Serializable { /** The D-Net Identifier associated to the record */ private String id; @@ -47,7 +49,7 @@ public class MetadataRecord implements Serializable { this.provenance = provenance; this.body = body; this.dateOfCollection = dateOfCollection; - this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix()); + this.id = ModelSupport.generateIdentifier(originalId, this.provenance.getNsPrefix()); } public String getId() { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/Provenance.java similarity index 96% rename from dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java rename to dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/Provenance.java index 556535022..8af58f628 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/Provenance.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.model.mdstore; +package eu.dnetlib.dhp.schema.mdstore; import java.io.Serializable; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala index f04f92c63..168ad218a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala @@ -2,7 +2,7 @@ package eu.dnetlib.dhp.actionmanager.datacite import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup -import eu.dnetlib.dhp.model.mdstore.MetadataRecord +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.utils.ISLookupClientFactory import org.apache.spark.SparkConf diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index ee82cc94f..043da31f9 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -30,8 +30,8 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; +import eu.dnetlib.dhp.schema.mdstore.Provenance; import scala.Tuple2; public class GenerateNativeStoreSparkJob { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index cc130c376..6a0938708 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -25,10 +25,9 @@ import eu.dnetlib.dhp.aggregation.common.AggregatorReport; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.message.MessageSender; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import parquet.hadoop.ParquetReader; public class TransformSparkJobNode { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java index 45ba2981f..096d0e289 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java @@ -11,7 +11,7 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.aggregation.common.AggregationCounter; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java index a813d84db..d9b38e572 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java @@ -10,7 +10,7 @@ import org.apache.spark.api.java.function.MapFunction; import eu.dnetlib.dhp.aggregation.common.AggregationCounter; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; import net.sf.saxon.s9api.*; public class XSLTTransformationFunction implements MapFunction { diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java 
b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java index 723f030a6..b8eb58ec2 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJobTest.java @@ -38,8 +38,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion; import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; +import eu.dnetlib.dhp.schema.mdstore.Provenance; import eu.dnetlib.dhp.transformation.TransformSparkJobNode; @TestMethodOrder(MethodOrderer.OrderAnnotation.class) diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 3c0c8bf0f..e29a8ac50 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -26,7 +26,7 @@ import org.mockito.junit.jupiter.MockitoExtension; import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest; import eu.dnetlib.dhp.aggregation.common.AggregationCounter; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; import eu.dnetlib.dhp.transformation.xslt.DateCleaner; import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; From b73dce3e3a1a97aa899e1ef2b8a92067026d2508 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 3 Mar 2021 10:17:16 +0100 Subject: [PATCH 65/86] more logging on the MDStore mongodb client. 
Forcing UTF_8 encoding on the content --- .../java/eu/dnetlib/dhp/common/MdstoreClient.java | 11 +++++++++-- .../plugin/mongodb/MDStoreCollectorPlugin.java | 7 +++++++ .../xslt/XSLTTransformationFunction.java | 8 ++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java index d29498306..38837b557 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java @@ -21,17 +21,19 @@ import com.mongodb.MongoClientURI; import com.mongodb.QueryBuilder; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class MdstoreClient implements Closeable { + private static final Logger log = LoggerFactory.getLogger(MdstoreClient.class); + private final MongoClient client; private final MongoDatabase db; private static final String COLL_METADATA = "metadata"; private static final String COLL_METADATA_MANAGER = "metadataManager"; - private static final Log log = LogFactory.getLog(MdstoreClient.class); - public MdstoreClient(final String baseUrl, final String dbName) { this.client = new MongoClient(new MongoClientURI(baseUrl)); this.db = getDb(client, dbName); @@ -40,11 +42,16 @@ public class MdstoreClient implements Closeable { public MongoCollection mdStore(final String mdId) { BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get(); + log.info("querying current mdId: {}", query.toJson()); + final String currentId = Optional .ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query)) .map(r -> r.first()) .map(d -> d.getString("currentId")) .orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId)); + + log.info("currentId: {}", currentId); + return getColl(db, currentId, true); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java index 77e899cc9..549c59720 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/mongodb/MDStoreCollectorPlugin.java @@ -8,6 +8,8 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; import org.bson.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.mongodb.client.MongoCollection; @@ -19,6 +21,8 @@ import eu.dnetlib.dhp.common.MdstoreClient; public class MDStoreCollectorPlugin implements CollectorPlugin { + private static final Logger log = LoggerFactory.getLogger(MDStoreCollectorPlugin.class); + public static final String MONGODB_DBNAME = "mongodb_dbname"; public static final String MDSTORE_ID = "mdstore_id"; @@ -30,14 +34,17 @@ public class MDStoreCollectorPlugin implements CollectorPlugin { .orElseThrow( () -> new CollectorException( "missing mongodb baseUrl, expected in eu.dnetlib.dhp.collection.ApiDescriptor.baseUrl")); + log.info("mongoBaseUrl: {}", mongoBaseUrl); final String dbName = Optional .ofNullable(api.getParams().get(MONGODB_DBNAME)) .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MONGODB_DBNAME))); + log.info("dbName: {}", dbName); final String mdId = 
Optional .ofNullable(api.getParams().get(MDSTORE_ID)) .orElseThrow(() -> new CollectorException(String.format("missing parameter '%s'", MDSTORE_ID))); + log.info("mdId: {}", mdId); final MdstoreClient client = new MdstoreClient(mongoBaseUrl, dbName); final MongoCollection mdstore = client.mdStore(mdId); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java index d9b38e572..430fbcf95 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java @@ -3,9 +3,11 @@ package eu.dnetlib.dhp.transformation.xslt; import java.io.ByteArrayInputStream; import java.io.StringWriter; +import java.nio.charset.StandardCharsets; import javax.xml.transform.stream.StreamSource; +import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.function.MapFunction; import eu.dnetlib.dhp.aggregation.common.AggregationCounter; @@ -44,18 +46,20 @@ public class XSLTTransformationFunction implements MapFunction Date: Wed, 3 Mar 2021 10:22:29 +0100 Subject: [PATCH 66/86] removed unused classes --- .../dhp/transformation/vocabulary/Term.java | 53 ------------------ .../transformation/vocabulary/Vocabulary.java | 54 ------------------- .../vocabulary/VocabularyHelper.java | 24 --------- .../vocabulary/VocabularyTest.java | 16 ------ 4 files changed, 147 deletions(-) delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java delete mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java delete mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java deleted file mode 100644 index b5ac18169..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java +++ /dev/null @@ -1,53 +0,0 @@ - -package eu.dnetlib.dhp.transformation.vocabulary; - -import java.io.Serializable; - -public class Term implements Serializable { - - private String englishName; - private String nativeName; - private String encoding; - private String code; - private String synonyms; - - public String getEnglishName() { - return englishName; - } - - public void setEnglishName(String englishName) { - this.englishName = englishName; - } - - public String getNativeName() { - return nativeName; - } - - public void setNativeName(String nativeName) { - this.nativeName = nativeName; - } - - public String getEncoding() { - return encoding; - } - - public void setEncoding(String encoding) { - this.encoding = encoding; - } - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public String getSynonyms() { - return synonyms; - } - - public void setSynonyms(String synonyms) { - this.synonyms = synonyms; - } -} diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java deleted file mode 100644 index a9da6b725..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java +++ /dev/null @@ -1,54 +0,0 @@ - -package eu.dnetlib.dhp.transformation.vocabulary; - -import java.io.Serializable; -import java.util.List; - -public class Vocabulary implements Serializable { - - private String id; - private String name; - private String description; - private String code; - private List terms; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getDescription() { - return description; - } - - public void setDescription(String description) { - this.description = description; - } - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public List getTerms() { - return terms; - } - - public void setTerms(List terms) { - this.terms = terms; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java deleted file mode 100644 index 10e959be0..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java +++ /dev/null @@ -1,24 +0,0 @@ - -package eu.dnetlib.dhp.transformation.vocabulary; - -import java.io.Serializable; -import java.net.URL; -import java.nio.charset.Charset; - -import org.apache.commons.io.IOUtils; - -import com.fasterxml.jackson.databind.ObjectMapper; - -public class VocabularyHelper implements Serializable { - - private static final String OPENAIRE_URL = "http://api.openaire.eu/vocabularies/%s.json"; - - public static Vocabulary getVocabularyFromAPI(final String vocabularyName) throws Exception { - final URL url = new URL(String.format(OPENAIRE_URL, vocabularyName)); - - final String response = IOUtils.toString(url, Charset.defaultCharset()); - final ObjectMapper jsonMapper = new ObjectMapper(); - final Vocabulary vocabulary = jsonMapper.readValue(response, Vocabulary.class); - return vocabulary; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java deleted file mode 100644 index 1ae942a6b..000000000 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java +++ /dev/null @@ -1,16 +0,0 @@ - -package eu.dnetlib.dhp.transformation.vocabulary; - -import static org.junit.jupiter.api.Assertions.*; - -import org.junit.jupiter.api.Test; - -public class VocabularyTest { - - @Test - public void testLoadVocabulary() throws Exception { - - final Vocabulary vocabulary = VocabularyHelper.getVocabularyFromAPI("dnet:languages"); - assertEquals("dnet:languages", vocabulary.getName()); - } -} From ec80b7ade3dd631874eb78296ce52da3b2205812 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 3 Mar 2021 10:22:53 +0100 Subject: [PATCH 67/86] code formatting --- .../src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java | 4 ++-- 
.../java/eu/dnetlib/dhp/schema/mdstore/MetadataRecord.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java index 38837b557..0bc782ccb 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java @@ -13,6 +13,8 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.bson.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.google.common.collect.Iterables; import com.mongodb.BasicDBObject; @@ -21,8 +23,6 @@ import com.mongodb.MongoClientURI; import com.mongodb.QueryBuilder; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class MdstoreClient implements Closeable { diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/MetadataRecord.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/MetadataRecord.java index 9586680e3..8277e1469 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/MetadataRecord.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/mdstore/MetadataRecord.java @@ -8,7 +8,7 @@ import eu.dnetlib.dhp.schema.common.ModelSupport; /** * This class models a record in a Metadata store collection on HDFS */ - public class MetadataRecord implements Serializable { +public class MetadataRecord implements Serializable { /** The D-Net Identifier associated to the record */ private String id; From 55f6ff5f559a7e765b8d0a911fe6ca330bcf7d83 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 3 Mar 2021 16:18:34 +0100 Subject: [PATCH 68/86] README.md for aggregation workflows --- dhp-workflows/dhp-aggregation/README.md | 27 +++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/README.md b/dhp-workflows/dhp-aggregation/README.md index e46fdeb16..5ed6a82d7 100644 --- a/dhp-workflows/dhp-aggregation/README.md +++ b/dhp-workflows/dhp-aggregation/README.md @@ -1,16 +1,27 @@ Description of the Module -------------------------- -This module defines a **collector worker application** that runs on Hadoop. +This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records. +Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure +the consistency of the read/write operations on the data, as the MdSM keeps track of the logical-physical mapping +of each MDStore. -It is responsible for harvesting metadata using different collector plugins and transformation into the common metadata model. +## Metadata collection -# Collector Plugins -* OAI Plugin + +The **metadata collection workflow** is responsible for harvesting metadata records via different protocols and in +different formats, and for storing them on HDFS so that they can be further processed. + +### Collector Plugins + +Different protocols are managed by dedicated Collector plugins, i.e. java programs implementing a defined interface: + +```eu.dnetlib.dhp.collection.plugin.CollectorPlugin``` + +The list of supported plugins: + +* OAI Plugin: collects from OAI-PMH compatible endpoints +* MDStore plugin: collects from a given D-Net MetadataStore (identified by mongodb URI, dbName, MDStoreID) +* MDStore dump plugin: collects from an MDStore dump stored on the HDFS location indicated by the `path` parameter
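For illustration, a minimal hypothetical plugin implementing this interface — a sketch only, which assumes the `Stream<String>`-returning `collect(ApiDescriptor, AggregatorReport)` signature visible in `MDStoreCollectorPlugin` earlier in this series — could look like this:

```java
package eu.dnetlib.dhp.collection.plugin.custom;

import java.util.stream.Stream;

import eu.dnetlib.dhp.aggregation.common.AggregatorReport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.CollectorException;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;

/** Hypothetical example, not part of this patch series: emits a single XML record. */
public class EchoCollectorPlugin implements CollectorPlugin {

	@Override
	public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report)
		throws CollectorException {
		if (api.getBaseUrl() == null) {
			throw new CollectorException("missing baseUrl");
		}
		// a real plugin would iterate the remote endpoint at api.getBaseUrl(),
		// driven by the protocol-specific entries in api.getParams()
		return Stream.of("<record source=\"" + api.getBaseUrl() + "\"/>");
	}
}
```

To be usable, such a plugin would also need a matching case in the `CollectorWorker` protocol switch, as done for `rest_json2xml` later in this series.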
# Transformation Plugins TODO - -# Usage -TODO - From fa7930d2e2c4aeda1ee42018be065826367dc96e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 5 Mar 2021 15:45:28 +0100 Subject: [PATCH 69/86] merging contributions from PR#97 --- .gitignore | 3 + .../common/vocabulary/VocabularyGroup.java | 24 + .../transformation/TransformationJobTest.java | 122 +- .../eu/dnetlib/dhp/transform/input_itgv4.xml | 70 ++ .../xslt_cleaning_datarepo_datacite.xsl | 432 +++++++ .../xslt_cleaning_datarepo_datacite_orig.xsl | 472 +++++++ .../scripts/xslt_cleaning_oaiOpenaire.xsl | 82 ++ ...enaire_datacite_ExchangeLandingpagePid.xsl | 791 ++++++++++++ ...e_datacite_ExchangeLandingpagePid_orig.xsl | 1081 +++++++++++++++++ .../dhp/transform/scripts/zenodo_tr.xsl | 451 +++++++ dhp-workflows/dhp-graph-mapper/pom.xml | 6 +- pom.xml | 28 +- 12 files changed, 3546 insertions(+), 16 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_itgv4.xml create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite_orig.xsl create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire.xsl create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/zenodo_tr.xsl diff --git a/.gitignore b/.gitignore index 2d7730711..f4fb46f2e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,8 @@ *.iws *~ .vscode +.metals +.bloop .classpath /*/.classpath /*/*/.classpath @@ -24,4 +26,5 @@ spark-warehouse /**/job-override.properties /**/*.log +/**/.factorypath diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java index f81181e53..12c6279e5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java @@ -122,7 +122,31 @@ public class VocabularyGroup implements Serializable { return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn); } + /** + * getSynonymAsQualifierCaseSensitive + * + * reflects the need to look up some vocabularies in a case-sensitive way + */ + public Qualifier getSynonymAsQualifierCaseSensitive(final String vocId, final String syn) { + if (StringUtils.isBlank(vocId)) { + return OafMapperUtils.unknown("", ""); + } + return vocs.get(vocId).getSynonymAsQualifier(syn); + } + + /** + * termExists + * + * two variants: without and with a case-sensitive check + */ public boolean termExists(final String 
vocId, final String id) { + return termExists(vocId, id, Boolean.FALSE); + } + + public boolean termExists(final String vocId, final String id, final Boolean caseSensitive) { + if (Boolean.TRUE.equals(caseSensitive)) { + return vocabularyExists(vocId) && vocs.get(vocId).termExists(id); + } return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id); } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index e29a8ac50..62a5223d9 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -51,13 +51,12 @@ public class TransformationJobTest extends AbstractVocabularyTest { } @Test - @DisplayName("Test Transform Single XML using XSLTTransformator") + @DisplayName("Test Transform Single XML using zenodo_tr XSLTTransformator") public void testTransformSaxonHE() throws Exception { // We Set the input Record getting the XML from the classpath final MetadataRecord mr = new MetadataRecord(); mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_zenodo.xml"))); - // We Load the XSLT transformation Rule from the classpath XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt"); @@ -68,6 +67,125 @@ public class TransformationJobTest extends AbstractVocabularyTest { // TODO Create significant Assert } + @Test + @DisplayName("Test Transform Inst.&Them.v4 record XML with zenodo_tr") + public void testTransformITGv4Zenodo() throws Exception { + + // We Set the input Record getting the XML from the classpath + final MetadataRecord mr = new MetadataRecord(); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); + // We Load the XSLT transformation Rule from the classpath + XSLTTransformationFunction tr = loadTransformationRule("/eu/dnetlib/dhp/transform/zenodo_tr.xslt"); + + MetadataRecord result = tr.call(mr); + + // Print the record + System.out.println(result.getBody()); + // TODO Create significant Assert + } + + @Test + @DisplayName("Test Transform Inst.&Them.v4 record XML with xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid") + public void testTransformITGv4() throws Exception { + + // We Set the input Record getting the XML from the classpath + final MetadataRecord mr = new MetadataRecord(); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); + // We Load the XSLT transformation Rule from the classpath + XSLTTransformationFunction tr = loadTransformationRule( + "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl"); + + MetadataRecord result = tr.call(mr); + + // Print the record + System.out.println(result.getBody()); + // TODO Create significant Assert + } + + @Test + @DisplayName("Test Transform record XML with xslt_cleaning_datarepo_datacite") + public void testTransformMostlyUsedScript() throws Exception { + + // We Set the input Record getting the XML from the classpath + final MetadataRecord mr = new MetadataRecord(); + mr.setBody(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input_itgv4.xml"))); + // We Load the XSLT transformation Rule from the classpath + 
XSLTTransformationFunction tr = loadTransformationRule( + "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl"); + + MetadataRecord result = tr.call(mr); + + // Print the record + System.out.println(result.getBody()); + // TODO Create significant Assert + } + + @Test + @DisplayName("Test TransformSparkJobNode.main with oaiOpenaire_datacite (v4)") + public void transformTestITGv4OAIdatacite(@TempDir Path testDir) throws Exception { + + SparkConf conf = new SparkConf(); + conf.setAppName(TransformationJobTest.class.getSimpleName()); + conf.setMaster("local"); + + try (SparkSession spark = SparkSession.builder().config(conf).getOrCreate()) { + + final String mdstore_input = this + .getClass() + .getResource("/eu/dnetlib/dhp/transform/mdstorenative") + .getFile(); + final String mdstore_output = testDir.toString() + "/version"; + + mockupTrasformationRule( + "simpleTRule", + "/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl"); + + final Map parameters = Stream.of(new String[][] { + { + "dateOfTransformation", "1234" + }, + { + "varOfficialName", "Publications at Bielefeld University" + }, + { + "varOfficialId", "opendoar____::2294" + }, + { + "transformationPlugin", "XSLT_TRANSFORM" + }, + { + "transformationRuleId", "simpleTRule" + }, + + }).collect(Collectors.toMap(data -> data[0], data -> data[1])); + + TransformSparkJobNode.transformRecords(parameters, isLookUpService, spark, mdstore_input, mdstore_output); + + // TODO introduce useful assertions + + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset mOutput = spark + .read() + .format("parquet") + .load(mdstore_output + MDSTORE_DATA_PATH) + .as(encoder); + + final Long total = mOutput.count(); + + final long recordTs = mOutput + .filter((FilterFunction) p -> p.getDateOfTransformation() == 1234) + .count(); + + final long recordNotEmpty = mOutput + .filter((FilterFunction) p -> !StringUtils.isBlank(p.getBody())) + .count(); + + assertEquals(total, recordTs); + + assertEquals(total, recordNotEmpty); + } + } + @Test @DisplayName("Test TransformSparkJobNode.main") public void transformTest(@TempDir Path testDir) throws Exception { diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_itgv4.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_itgv4.xml new file mode 100644 index 000000000..06325810b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/input_itgv4.xml @@ -0,0 +1,70 @@ + + + + + od______2294::0000955eab68583ba0e07e973dd48708 + oai:pub.uni-bielefeld.de:1997560 + 2021-02-23T13:14:00.839Z + od______2294 + oai:pub.uni-bielefeld.de:1997560 + 2018-07-24T12:58:03Z + journal_article + doc-type:article + + + + Die antiken Grundlagen der europäischen Expansion. Eine epochenübergreifende kulturhistorische Unterrichtseinheit + + + Schulz, Raimund + + + + https://pub.uni-bielefeld.de/record/1997560.json + + + 0016-9056 + + ger + Friedrich + 2002 + journal article + https://pub.uni-bielefeld.de/record/1997560 + metadata only access + Schulz R. Die antiken Grundlagen der europäischen Expansion. Eine epochenübergreifende kulturhistorische Unterrichtseinheit. GWU. 2002;53(5-/6):340-360. 
+ In Copyright + GWU + 53 + 5-/6 + http%3A%2F%2Fpub.uni-bielefeld.de%2Foai + oai:pub.uni-bielefeld.de:1997560 + 2018-07-24T12:58:03Z + false + false + 0.9 [remainder of the input_itgv4.xml fixture: the XML markup was stripped in extraction, only the text nodes above survive] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl new file mode 100644 index 000000000..f815c0260 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite.xsl @@ -0,0 +1,432 @@ [432-line XSLT stylesheet: markup stripped in extraction; surviving text nodes: "record is not compliant, transformation is interrupted.", "OPEN", "RESTRICTED", "UNKNOWN", "DE"] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite_orig.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite_orig.xsl new file mode 100644 index 000000000..d8b14fadd --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_datarepo_datacite_orig.xsl @@ -0,0 +1,472 @@ [472-line XSLT stylesheet: markup stripped in extraction; surviving text nodes as in the previous file] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire.xsl new file mode 100644 index 000000000..53a3466a9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire.xsl @@ -0,0 +1,82 @@ [82-line XSLT stylesheet: markup stripped in extraction; surviving text node: "record is not compliant, transformation is interrupted."] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl new file mode 100644 index 000000000..56451505e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid.xsl @@ -0,0 +1,791 @@ [791-line XSLT stylesheet: markup stripped in extraction; surviving text node: "record is not compliant, transformation is interrupted."] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl new file mode 100644 index 000000000..3cfaec80b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/xslt_cleaning_oaiOpenaire_datacite_ExchangeLandingpagePid_orig.xsl @@ -0,0 +1,1081 @@ [1081-line XSLT stylesheet: markup stripped in extraction; surviving text node: "record is not compliant, transformation is interrupted."] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/zenodo_tr.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/zenodo_tr.xsl new file mode 100644 index 000000000..0c3f4b1f9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/scripts/zenodo_tr.xsl @@ -0,0 +1,451 @@ [451-line XSLT stylesheet: markup stripped in extraction; surviving text nodes: "OPEN", "CLOSED"] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index 5e8448182..81d93f97b 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -33,6 +33,10 @@ + + -Xmax-classfile-name + 140 + ${scala.version} @@ -67,7 +71,7 @@ - org.apache.httpcomponents + org.apache.httpcomponents httpclient diff --git a/pom.xml b/pom.xml index bef649c67..45bb6bf78 100644 --- a/pom.xml +++ b/pom.xml @@ -362,7 +362,7 @@ ${dnet.openaire.broker.common} - + org.apache.cxf cxf-rt-transports-http 3.1.5 @@ -406,20 +406,20 @@ 4.0 - - com.ximpleware - vtd-xml - ${vtd.version} - + + com.ximpleware + vtd-xml + ${vtd.version} + - - org.elasticsearch - elasticsearch-hadoop - 7.6.0 - + + org.elasticsearch + elasticsearch-hadoop + 7.6.0 + - + org.apache.oozie oozie-client ${dhp.oozie.version} @@ -685,6 +685,8 @@ UTF-8 UTF-8 3.6.0 + 1.8 + 1.8 2.22.2 2.0.1 cdh5.9.2 @@ 
-711,4 +713,4 @@ 4.5.3 4.0.1 - + \ No newline at end of file From acbe3119a4a5510bfd6ab887c6165e96947e6409 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 8 Mar 2021 09:44:09 +0100 Subject: [PATCH 70/86] RestCollectorPlugin imported from dne45 --- dhp-workflows/dhp-aggregation/pom.xml | 5 + .../dhp/collection/CollectorWorker.java | 3 + .../eu/dnetlib/dhp/collection/JsonUtils.java | 84 ++++ .../collection/plugin/CollectorPlugin.java | 2 +- .../plugin/rest/RestCollectorPlugin.java | 92 ++++ .../collection/plugin/rest/RestIterator.java | 442 ++++++++++++++++++ .../plugin/rest/RestCollectorPluginTest.java | 81 ++++ .../plugin/rest/RestIteratorTest.java | 54 +++ pom.xml | 6 + 9 files changed, 768 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index 6887be55e..cfe9e74fd 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -86,6 +86,11 @@ jaxen + + org.json + json + + org.apache.commons diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java index ef29cb5b1..f9d7d7dae 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java @@ -24,6 +24,7 @@ import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin; public class CollectorWorker extends ReportingJob { @@ -109,6 +110,8 @@ public class CollectorWorker extends ReportingJob { switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) { case oai: return new OaiCollectorPlugin(clientParams); + case rest_json2xml: + return new RestCollectorPlugin(clientParams); case other: final CollectorPlugin.NAME.OTHER_NAME plugin = Optional .ofNullable(api.getParams().get("other_plugin_type")) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java new file mode 100644 index 000000000..da3768a4a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java @@ -0,0 +1,84 @@ + +package eu.dnetlib.dhp.collection; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class JsonUtils { + + private static final Log log = LogFactory.getLog(JsonUtils.class); + + public static final String wrapName = "recordWrap"; + + /** + * convert in JSON-KeyName 'whitespace(s)' to '_' and '/' to '_', '(' and ')' 
to '' + * check W3C XML syntax: https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags for valid tag names + * and work-around for the JSON to XML converting of org.json.XML-package. + * + * known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"], + * + * @param jsonInput + * @return convertedJsonKeynameOutput + */ + public String syntaxConvertJsonKeyNames(String jsonInput) { + + log.trace("before convertJsonKeyNames: " + jsonInput); + // pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml) + // replace ' 's in JSON Namens with '_' + while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":"); + } + + // replace forward-slash (sign '/' ) in JSON Names with '_' + while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":"); + } + + // replace '(' in JSON Names with '' + while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":"); + } + + // replace ')' in JSON Names with '' + while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":"); + } + + // add prefix of startNumbers in JSON Keynames with 'n_' + while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":"); + } + // add prefix of only numbers in JSON Keynames with 'm_' + while (jsonInput.matches(".*\"([0-9]+)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":"); + } + + // replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with '' + while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":"); + } + + // replace ',' in JSON Keynames with '.' to prevent , in xml tagnames. 
+ // while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) { + // jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":"); + // } + + // replace '=' in JSON Keynames with '-' + while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":"); + } + + log.trace("after syntaxConvertJsonKeyNames: " + jsonInput); + return jsonInput; + } + + public String convertToXML(final String jsonRecord) { + String resultXml = ""; + org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord)); + resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element + log.trace("before inputStream: " + resultXml); + resultXml = XmlCleaner.cleanAllEntities(resultXml); + log.trace("after cleaning: " + resultXml); + return resultXml; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index 0ed6be5fa..457f63468 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -10,7 +10,7 @@ import eu.dnetlib.dhp.collection.CollectorException; public interface CollectorPlugin { enum NAME { - oai, other; + oai, other, rest_json2xml; public enum OTHER_NAME { mdstore_mongodb_dump, mdstore_mongodb diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java new file mode 100644 index 000000000..ad8bfa4ea --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java @@ -0,0 +1,92 @@ + +package eu.dnetlib.dhp.collection.plugin.rest; + +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpClientParams; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; + +/** + * TODO: delegate HTTP requests to the common HttpConnector2 implementation. 
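[Editor's note, not part of the patch: a minimal sketch of the JSON-to-XML round trip performed by the JsonUtils class above. The input document and the expected output are illustrative assumptions inferred from the regexes in syntaxConvertJsonKeyNames and from the behaviour of org.json.XML.toString; the RestCollectorPlugin listing resumes right after this note.]

    import eu.dnetlib.dhp.collection.JsonUtils;

    public class JsonUtilsSketch {
        public static void main(String[] args) {
            JsonUtils jsonUtils = new JsonUtils();
            // keys containing spaces or '/' are not valid XML element names,
            // so syntaxConvertJsonKeyNames rewrites them before conversion
            String json = "{\"total hits\": 2, \"dc/title\": \"On Grids\"}";
            // after key sanitisation: {"total_hits": 2, "dc_title": "On Grids"}
            String xml = jsonUtils.convertToXML(json);
            // expected shape (element order may vary, org.json.JSONObject is unordered):
            // <recordWrap><total_hits>2</total_hits><dc_title>On Grids</dc_title></recordWrap>
            System.out.println(xml);
        }
    }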
+ * + * @author js, Andreas Czerniak + * @date 2020-04-09 + * + */ +public class RestCollectorPlugin implements CollectorPlugin { + + private HttpClientParams clientParams; + + public RestCollectorPlugin(HttpClientParams clientParams) { + this.clientParams = clientParams; + } + + @Override + public Stream collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException { + final String baseUrl = api.getBaseUrl(); + final String resumptionType = api.getParams().get("resumptionType"); + final String resumptionParam = api.getParams().get("resumptionParam"); + final String resumptionXpath = api.getParams().get("resumptionXpath"); + final String resultTotalXpath = api.getParams().get("resultTotalXpath"); + final String resultFormatParam = api.getParams().get("resultFormatParam"); + final String resultFormatValue = api.getParams().get("resultFormatValue"); + final String resultSizeParam = api.getParams().get("resultSizeParam"); + final String resultSizeValue = (StringUtils.isBlank(api.getParams().get("resultSizeValue"))) ? "100" + : api.getParams().get("resultSizeValue"); + final String queryParams = api.getParams().get("queryParams"); + final String entityXpath = api.getParams().get("entityXpath"); + final String authMethod = api.getParams().get("authMethod"); + final String authToken = api.getParams().get("authToken"); + + if (StringUtils.isBlank(baseUrl)) { + throw new CollectorException("Param 'baseUrl' is null or empty"); + } + if (StringUtils.isBlank(resumptionType)) { + throw new CollectorException("Param 'resumptionType' is null or empty"); + } + if (StringUtils.isBlank(resumptionParam)) { + throw new CollectorException("Param 'resumptionParam' is null or empty"); + } + if (StringUtils.isBlank(resultFormatValue)) { + throw new CollectorException("Param 'resultFormatValue' is null or empty"); + } + if (StringUtils.isBlank(queryParams)) { + throw new CollectorException("Param 'queryParams' is null or empty"); + } + if (StringUtils.isBlank(entityXpath)) { + throw new CollectorException("Param 'entityXpath' is null or empty"); + } + + RestIterator it = new RestIterator( + getClientParams(), + baseUrl, + resumptionType, + resumptionParam, + resumptionXpath, + resultTotalXpath, + resultFormatParam, + resultFormatValue, + resultSizeParam, + resultSizeValue, + queryParams, + entityXpath, + authMethod, + authToken); + + return StreamSupport + .stream( + Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false); + } + + public HttpClientParams getClientParams() { + return clientParams; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java new file mode 100644 index 000000000..b728293d5 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -0,0 +1,442 @@ + +package eu.dnetlib.dhp.collection.plugin.rest; + +import java.io.InputStream; +import java.io.StringWriter; +import java.io.UnsupportedEncodingException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.Iterator; +import java.util.Queue; +import java.util.concurrent.PriorityBlockingQueue; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerFactory; 
+import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.xpath.*; + +import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.http.HttpHeaders; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; + +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpClientParams; +import eu.dnetlib.dhp.collection.JsonUtils; + +/** + * log.debug(...) equal to log.trace(...) in the application-logs + *

+ * known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue + * + * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak + * @date 2020-04-09 + * + */ +public class RestIterator implements Iterator { + + private static final Log log = LogFactory.getLog(RestIterator.class); + + private HttpClientParams clientParams; + + private final String BASIC = "basic"; + + private JsonUtils jsonUtils; + + private String baseUrl; + private String resumptionType; + private String resumptionParam; + private String resultFormatValue; + private String queryParams; + private int resultSizeValue; + private int resumptionInt = 0; // integer resumption token (first record to harvest) + private int resultTotal = -1; + private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest + // or token scanned from results) + private InputStream resultStream; + private Transformer transformer; + private XPath xpath; + private String query; + private XPathExpression xprResultTotalPath; + private XPathExpression xprResumptionPath; + private XPathExpression xprEntity; + private String queryFormat; + private String querySize; + private String authMethod; + private String authToken; + private final Queue recordQueue = new PriorityBlockingQueue(); + private int discoverResultSize = 0; + private int pagination = 1; + + /** + * RestIterator class + * + * compatible to version before 1.3.33 + * + * @param baseUrl + * @param resumptionType + * @param resumptionParam + * @param resumptionXpath + * @param resultTotalXpath + * @param resultFormatParam + * @param resultFormatValue + * @param resultSizeParam + * @param resultSizeValueStr + * @param queryParams + * @param entityXpath + */ + public RestIterator( + final HttpClientParams clientParams, + final String baseUrl, + final String resumptionType, + final String resumptionParam, + final String resumptionXpath, + final String resultTotalXpath, + final String resultFormatParam, + final String resultFormatValue, + final String resultSizeParam, + final String resultSizeValueStr, + final String queryParams, + final String entityXpath) { + this(clientParams, baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, + resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValueStr, queryParams, entityXpath, "", + ""); + } + + public RestIterator( + final HttpClientParams clientParams, + final String baseUrl, + final String resumptionType, + final String resumptionParam, + final String resumptionXpath, + final String resultTotalXpath, + final String resultFormatParam, + final String resultFormatValue, + final String resultSizeParam, + final String resultSizeValueStr, + final String queryParams, + final String entityXpath, + final String authMethod, + final String authToken, + final String resultOffsetParam) { + this(clientParams, baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, + resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValueStr, queryParams, entityXpath, "", + ""); + } + + /** RestIterator class + * compatible to version 1.3.33 + */ + public RestIterator( + final HttpClientParams clientParams, + final String baseUrl, + final String resumptionType, + final String resumptionParam, + final String resumptionXpath, + final String resultTotalXpath, + final String resultFormatParam, + final String resultFormatValue, + final String resultSizeParam, + final String 
resultSizeValueStr, + final String queryParams, + final String entityXpath, + final String authMethod, + final String authToken) { + this.clientParams = clientParams; + this.jsonUtils = new JsonUtils(); + this.baseUrl = baseUrl; + this.resumptionType = resumptionType; + this.resumptionParam = resumptionParam; + this.resultFormatValue = resultFormatValue; + this.queryParams = queryParams; + this.resultSizeValue = Integer.valueOf(resultSizeValueStr); + this.authMethod = authMethod; + this.authToken = authToken; + + queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue + : ""; + querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : ""; + + try { + initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath); + } catch (Exception e) { + throw new IllegalStateException("xml transformation init failed: " + e.getMessage()); + } + initQueue(); + } + + private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) + throws TransformerConfigurationException, XPathExpressionException { + transformer = TransformerFactory.newInstance().newTransformer(); + transformer.setOutputProperty(OutputKeys.INDENT, "yes"); + transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3"); + xpath = XPathFactory.newInstance().newXPath(); + xprResultTotalPath = xpath.compile(resultTotalXpath); + xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath); + xprEntity = xpath.compile(entityXpath); + } + + private void initQueue() { + query = baseUrl + "?" + queryParams + querySize + queryFormat; + } + + private void disconnect() { + // TODO close inputstream + } + + /* + * (non-Javadoc) + * @see java.util.Iterator#hasNext() + */ + @Override + public boolean hasNext() { + if (recordQueue.isEmpty() && query.isEmpty()) { + disconnect(); + return false; + } else { + return true; + } + } + + /* + * (non-Javadoc) + * @see java.util.Iterator#next() + */ + @Override + public String next() { + synchronized (recordQueue) { + while (recordQueue.isEmpty() && !query.isEmpty()) { + try { + log.debug("get Query: " + query); + query = downloadPage(query); + log.debug("next queryURL from downloadPage(): " + query); + } catch (CollectorException e) { + log.debug("CollectorPlugin.next()-Exception: " + e); + throw new RuntimeException(e); + } + } + return recordQueue.poll(); + } + } + + /* + * download page and return nextQuery + */ + private String downloadPage(String query) throws CollectorException { + String resultJson; + String resultXml = ""; + String emptyXml = resultXml + "<" + JsonUtils.wrapName + ">"; + Node resultNode = null; + NodeList nodeList = null; + InputStream theHttpInputStream; + + // check if cursor=* is initial set otherwise add it to the queryParam URL + if (resumptionType.equalsIgnoreCase("deep-cursor")) { + log.debug("check resumptionType deep-cursor and check cursor=*?" 
+ query); + if (!query.contains("&cursor=")) { + query += "&cursor=*"; + } + } + + try { + URL qUrl = new URL(query); + log.debug("authMethod :" + authMethod); + if (this.authMethod == "bearer") { + log.trace("authMethod before inputStream: " + resultXml); + HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); + conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken); + conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/json"); + conn.setRequestMethod("GET"); + theHttpInputStream = conn.getInputStream(); + } else if (BASIC.equalsIgnoreCase(this.authMethod)) { + log.trace("authMethod before inputStream: " + resultXml); + HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); + conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken); + conn.setRequestProperty(HttpHeaders.ACCEPT, "application/xml"); + conn.setRequestMethod("GET"); + theHttpInputStream = conn.getInputStream(); + } else { + theHttpInputStream = qUrl.openStream(); + } + + resultStream = theHttpInputStream; + if ("json".equalsIgnoreCase(resultFormatValue)) { + resultJson = IOUtils.toString(resultStream, "UTF-8"); + resultXml = jsonUtils.convertToXML(resultJson); + resultStream = IOUtils.toInputStream(resultXml, "UTF-8"); + } + + if (!(emptyXml).equalsIgnoreCase(resultXml)) { + resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE); + nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET); + log.debug("nodeList.length: " + nodeList.getLength()); + for (int i = 0; i < nodeList.getLength(); i++) { + StringWriter sw = new StringWriter(); + transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw)); + recordQueue.add(sw.toString()); + } + } else { + log.info("resultXml is equal with emptyXml"); + } + + resumptionInt += resultSizeValue; + + String qUrlArgument = ""; + switch (resumptionType.toLowerCase()) { + case "scan": // read of resumptionToken , evaluate next results, e.g. 
OAI, iterate over items + resumptionStr = xprResumptionPath.evaluate(resultNode); + break; + + case "count": // begin at one step for all records, iterate over items + resumptionStr = Integer.toString(resumptionInt); + break; + + case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808) + if (resultSizeValue < 2) { + throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2"); + } + qUrlArgument = qUrl.getQuery(); + String[] arrayQUrlArgument = qUrlArgument.split("&"); + int urlOldResumptionSize = 0; + for (String arrayUrlArgStr : arrayQUrlArgument) { + if (arrayUrlArgStr.startsWith(resumptionParam)) { + String[] resumptionKeyValue = arrayUrlArgStr.split("="); + if (isInteger(resumptionKeyValue[1])) { + urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]); + log.debug("discover OldResumptionSize from Url (int): " + urlOldResumptionSize); + } else { + log.debug("discover OldResumptionSize from Url (str): " + resumptionKeyValue[1]); + } + } + } + + if (((emptyXml).equalsIgnoreCase(resultXml)) + || ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) { + // resumptionStr = ""; + if (nodeList != null) { + discoverResultSize += nodeList.getLength(); + } + resultTotal = discoverResultSize; + } else { + resumptionStr = Integer.toString(resumptionInt); + resultTotal = resumptionInt + 1; + if (nodeList != null) { + discoverResultSize += nodeList.getLength(); + } + } + log.debug("discoverResultSize: " + discoverResultSize); + break; + + case "pagination": + case "page": // pagination, iterate over page numbers + pagination += 1; + if (nodeList != null) { + discoverResultSize += nodeList.getLength(); + } else { + resultTotal = discoverResultSize; + pagination = discoverResultSize; + } + resumptionInt = pagination; + resumptionStr = Integer.toString(resumptionInt); + break; + + case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in + // solr) + // isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode: + // deep-cursor, Param 'resultSizeValue' is less than 2");} + + resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode)); + queryParams = queryParams.replace("&cursor=*", ""); + + // terminating if length of nodeList is 0 + if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) { + resumptionInt += (nodeList.getLength() + 1 - resultSizeValue); + } else { + resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue + // because the iteration is over + // real length and the + // resultSizeValue is added before + // the switch() + } + + discoverResultSize = nodeList.getLength(); + + log + .debug( + "downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams=" + + queryParams + " resumptionLengthIncreased: " + resumptionInt); + + break; + + default: // otherwise: abort + // resultTotal = resumptionInt; + break; + } + + } catch (Exception e) { + log.error(e); + throw new IllegalStateException("collection failed: " + e.getMessage()); + } + + try { + if (resultTotal == -1) { + resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode)); + if (resumptionType.toLowerCase().equals("page") && !BASIC.equalsIgnoreCase(authMethod)) { + resultTotal += 1; + } // to correct the upper bound + log.info("resultTotal was -1 is now: " + resultTotal); + } + } catch (Exception e) { + log.error(e); + throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + 
e.getMessage()); + } + log.debug("resultTotal: " + resultTotal); + log.debug("resInt: " + resumptionInt); + String nextQuery; + if (resumptionInt <= resultTotal) { + nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + + queryFormat; + } else { + nextQuery = ""; + // if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the + // resumptionInt and prevent a NullPointer Exception at mdStore + } + log.debug("nextQueryUrl: " + nextQuery); + return nextQuery; + } + + private boolean isInteger(String s) { + boolean isValidInteger = false; + try { + Integer.parseInt(s); + + // s is a valid integer + + isValidInteger = true; + } catch (NumberFormatException ex) { + // s is not an integer + } + + return isValidInteger; + } + + // Method to encode a string value using `UTF-8` encoding scheme + private String encodeValue(String value) { + try { + return URLEncoder.encode(value, StandardCharsets.UTF_8.toString()); + } catch (UnsupportedEncodingException ex) { + throw new RuntimeException(ex.getCause()); + } + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java new file mode 100644 index 000000000..648ac85fb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPluginTest.java @@ -0,0 +1,81 @@ +/** + * + */ + +package eu.dnetlib.dhp.collection.plugin.rest; + +import java.util.HashMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Stream; + +import org.junit.jupiter.api.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.aggregation.common.AggregatorReport; +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpClientParams; + +/** + * @author js, Andreas Czerniak + * + */ +public class RestCollectorPluginTest { + + private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class); + + private String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search"; + private String resumptionType = "count"; + private String resumptionParam = "from"; + private String entityXpath = "//hits/hits"; + private String resumptionXpath = "//hits"; + private String resultTotalXpath = "//hits/total"; + private String resultFormatParam = "format"; + private String resultFormatValue = "json"; + private String resultSizeParam = "size"; + private String resultSizeValue = "10"; + // private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; + private String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29"; + // private String query = "=(sources:engrXiv AND type:preprint)"; + + private String protocolDescriptor = "rest_json2xml"; + private ApiDescriptor api = new ApiDescriptor(); + private RestCollectorPlugin rcp; + + @BeforeEach + public void setUp() { + HashMap params = new HashMap<>(); + params.put("resumptionType", resumptionType); + params.put("resumptionParam", resumptionParam); + params.put("resumptionXpath", resumptionXpath); + params.put("resultTotalXpath", resultTotalXpath); + params.put("resultFormatParam", resultFormatParam); + params.put("resultFormatValue", resultFormatValue); + params.put("resultSizeParam", resultSizeParam); + params.put("resultSizeValue", resultSizeValue); + 
params.put("queryParams", query); + params.put("entityXpath", entityXpath); + + api.setBaseUrl(baseUrl); + api.setParams(params); + + rcp = new RestCollectorPlugin(new HttpClientParams()); + } + + @Disabled + @Test + public void test() throws CollectorException { + AtomicInteger i = new AtomicInteger(0); + final Stream stream = rcp.collect(api, new AggregatorReport()); + + stream.limit(200).forEach(s -> { + Assertions.assertTrue(s.length() > 0); + i.incrementAndGet(); + log.info(s); + }); + + log.info("{}", i.intValue()); + Assertions.assertTrue(i.intValue() > 0); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java new file mode 100644 index 000000000..16604e0eb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/RestIteratorTest.java @@ -0,0 +1,54 @@ +/** + * + */ + +package eu.dnetlib.dhp.collection.plugin.rest; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.collection.HttpClientParams; + +/** + * + * @author js, Andreas Czerniak + * @date 2020-04-08 + */ +public class RestIteratorTest { + + private static final Logger log = LoggerFactory.getLogger(RestIteratorTest.class); + + private String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search"; + private String resumptionType = "count"; + private String resumptionParam = "from"; + private String resumptionXpath = ""; + private String resultTotalXpath = "//hits/total"; + private String entityXpath = "//hits/hits"; + private String resultFormatParam = "format"; + private String resultFormatValue = "Json"; // Change from lowerCase to one UpperCase + private String resultSizeParam = "size"; + private String resultSizeValue = "10"; + private String authMethod = ""; + private String authToken = ""; + private String resultOffsetParam = "cursor"; + private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29"; + + @Disabled + @Test + public void test() { + + HttpClientParams clientParams = new HttpClientParams(); + + final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam, + resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue, + query, entityXpath, authMethod, authToken, resultOffsetParam); + int i = 20; + while (iterator.hasNext() && i > 0) { + String result = iterator.next(); + + i--; + } + } +} diff --git a/pom.xml b/pom.xml index 45bb6bf78..5c45fad5f 100644 --- a/pom.xml +++ b/pom.xml @@ -461,6 +461,12 @@ ${apache.poi.version} + + org.json + json + 20180813 + + org.json4s json4s-jackson_2.11 From 61a2551e74d18f35cfbfe3043e51899d478e3285 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 15 Mar 2021 17:17:55 +0100 Subject: [PATCH 71/86] migrated last changes from svn (dnet45) --- .../plugin/rest/RestCollectorPlugin.java | 19 +- .../collection/plugin/rest/RestIterator.java | 166 +++++++----------- 2 files changed, 83 insertions(+), 102 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java index ad8bfa4ea..e59db143a 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestCollectorPlugin.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.collection.plugin.rest; +import java.util.Optional; import java.util.Spliterator; import java.util.Spliterators; import java.util.stream.Stream; @@ -23,6 +24,8 @@ import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; */ public class RestCollectorPlugin implements CollectorPlugin { + public static final String RESULT_SIZE_VALUE_DEFAULT = "100"; + private HttpClientParams clientParams; public RestCollectorPlugin(HttpClientParams clientParams) { @@ -32,6 +35,7 @@ public class RestCollectorPlugin implements CollectorPlugin { @Override public Stream collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException { final String baseUrl = api.getBaseUrl(); + final String resumptionType = api.getParams().get("resumptionType"); final String resumptionParam = api.getParams().get("resumptionParam"); final String resumptionXpath = api.getParams().get("resumptionXpath"); @@ -39,12 +43,14 @@ public class RestCollectorPlugin implements CollectorPlugin { final String resultFormatParam = api.getParams().get("resultFormatParam"); final String resultFormatValue = api.getParams().get("resultFormatValue"); final String resultSizeParam = api.getParams().get("resultSizeParam"); - final String resultSizeValue = (StringUtils.isBlank(api.getParams().get("resultSizeValue"))) ? "100" - : api.getParams().get("resultSizeValue"); final String queryParams = api.getParams().get("queryParams"); final String entityXpath = api.getParams().get("entityXpath"); final String authMethod = api.getParams().get("authMethod"); final String authToken = api.getParams().get("authToken"); + final String resultSizeValue = Optional + .ofNullable(api.getParams().get("resultSizeValue")) + .filter(StringUtils::isNotBlank) + .orElse(RESULT_SIZE_VALUE_DEFAULT); if (StringUtils.isBlank(baseUrl)) { throw new CollectorException("Param 'baseUrl' is null or empty"); @@ -65,6 +71,12 @@ public class RestCollectorPlugin implements CollectorPlugin { throw new CollectorException("Param 'entityXpath' is null or empty"); } + final String resultOutputFormat = Optional + .ofNullable(api.getParams().get("resultOutputFormat")) + .map(String::toLowerCase) + .filter(StringUtils::isNotBlank) + .orElse(resultFormatValue.toLowerCase()); + RestIterator it = new RestIterator( getClientParams(), baseUrl, @@ -79,7 +91,8 @@ public class RestCollectorPlugin implements CollectorPlugin { queryParams, entityXpath, authMethod, - authToken); + authToken, + resultOutputFormat); return StreamSupport .stream( diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index b728293d5..fdefa67b8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -1,6 +1,27 @@ package eu.dnetlib.dhp.collection.plugin.rest; +import eu.dnetlib.dhp.collection.CollectorException; +import eu.dnetlib.dhp.collection.HttpClientParams; +import eu.dnetlib.dhp.collection.JsonUtils; +import org.apache.avro.test.http.Http; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; 
+import org.apache.http.HttpHeaders; +import org.apache.http.entity.ContentType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import javax.xml.xpath.*; import java.io.InputStream; import java.io.StringWriter; import java.io.UnsupportedEncodingException; @@ -12,30 +33,8 @@ import java.util.Iterator; import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerConfigurationException; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; -import javax.xml.xpath.*; - -import org.apache.commons.httpclient.HttpMethod; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.http.HttpHeaders; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.InputSource; - -import eu.dnetlib.dhp.collection.CollectorException; -import eu.dnetlib.dhp.collection.HttpClientParams; -import eu.dnetlib.dhp.collection.JsonUtils; - /** - * log.debug(...) equal to log.trace(...) in the application-logs + * log.info(...) equal to log.trace(...) in the application-logs *

* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue * @@ -45,7 +44,8 @@ import eu.dnetlib.dhp.collection.JsonUtils; */ public class RestIterator implements Iterator { - private static final Log log = LogFactory.getLog(RestIterator.class); + private static final Logger log = LoggerFactory.getLogger(RestIterator.class); + public static final String UTF_8 = "UTF-8"; private HttpClientParams clientParams; @@ -74,65 +74,15 @@ public class RestIterator implements Iterator { private String querySize; private String authMethod; private String authToken; - private final Queue recordQueue = new PriorityBlockingQueue(); + private Queue recordQueue = new PriorityBlockingQueue(); private int discoverResultSize = 0; private int pagination = 1; - - /** - * RestIterator class - * - * compatible to version before 1.3.33 - * - * @param baseUrl - * @param resumptionType - * @param resumptionParam - * @param resumptionXpath - * @param resultTotalXpath - * @param resultFormatParam - * @param resultFormatValue - * @param resultSizeParam - * @param resultSizeValueStr - * @param queryParams - * @param entityXpath + /* + * While resultFormatValue is added to the request parameter, this is used to say that the results are retrieved in + * json. useful for cases when the target API expects a resultFormatValue != json, but the results are returned in + * json. An example is the EU Open Data Portal API: resultFormatValue=standard, results are in json format. */ - public RestIterator( - final HttpClientParams clientParams, - final String baseUrl, - final String resumptionType, - final String resumptionParam, - final String resumptionXpath, - final String resultTotalXpath, - final String resultFormatParam, - final String resultFormatValue, - final String resultSizeParam, - final String resultSizeValueStr, - final String queryParams, - final String entityXpath) { - this(clientParams, baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, - resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValueStr, queryParams, entityXpath, "", - ""); - } - - public RestIterator( - final HttpClientParams clientParams, - final String baseUrl, - final String resumptionType, - final String resumptionParam, - final String resumptionXpath, - final String resultTotalXpath, - final String resultFormatParam, - final String resultFormatValue, - final String resultSizeParam, - final String resultSizeValueStr, - final String queryParams, - final String entityXpath, - final String authMethod, - final String authToken, - final String resultOffsetParam) { - this(clientParams, baseUrl, resumptionType, resumptionParam, resumptionXpath, resultTotalXpath, - resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValueStr, queryParams, entityXpath, "", - ""); - } + private String resultOutputFormat; /** RestIterator class * compatible to version 1.3.33 @@ -151,17 +101,20 @@ public class RestIterator implements Iterator { final String queryParams, final String entityXpath, final String authMethod, - final String authToken) { + final String authToken, + final String resultOutputFormat) { + this.clientParams = clientParams; this.jsonUtils = new JsonUtils(); this.baseUrl = baseUrl; this.resumptionType = resumptionType; this.resumptionParam = resumptionParam; this.resultFormatValue = resultFormatValue; - this.queryParams = queryParams; this.resultSizeValue = Integer.valueOf(resultSizeValueStr); + this.queryParams = queryParams; 
this.authMethod = authMethod; this.authToken = authToken; + this.resultOutputFormat = resultOutputFormat; queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue : ""; @@ -188,6 +141,7 @@ public class RestIterator implements Iterator { private void initQueue() { query = baseUrl + "?" + queryParams + querySize + queryFormat; + log.info("REST calls starting with " + query); } private void disconnect() { @@ -217,9 +171,7 @@ public class RestIterator implements Iterator { synchronized (recordQueue) { while (recordQueue.isEmpty() && !query.isEmpty()) { try { - log.debug("get Query: " + query); query = downloadPage(query); - log.debug("next queryURL from downloadPage(): " + query); } catch (CollectorException e) { log.debug("CollectorPlugin.next()-Exception: " + e); throw new RuntimeException(e); @@ -235,9 +187,12 @@ public class RestIterator implements Iterator { private String downloadPage(String query) throws CollectorException { String resultJson; String resultXml = ""; + String nextQuery = ""; String emptyXml = resultXml + "<" + JsonUtils.wrapName + ">"; Node resultNode = null; NodeList nodeList = null; + String qUrlArgument = ""; + int urlOldResumptionSize = 0; InputStream theHttpInputStream; // check if cursor=* is initial set otherwise add it to the queryParam URL @@ -249,20 +204,22 @@ public class RestIterator implements Iterator { } try { + log.info("requestig URL [{}]", query); + URL qUrl = new URL(query); log.debug("authMethod :" + authMethod); - if (this.authMethod == "bearer") { + if ("bearer".equalsIgnoreCase(this.authMethod)) { log.trace("authMethod before inputStream: " + resultXml); HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken); - conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/json"); + conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); conn.setRequestMethod("GET"); theHttpInputStream = conn.getInputStream(); } else if (BASIC.equalsIgnoreCase(this.authMethod)) { log.trace("authMethod before inputStream: " + resultXml); HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken); - conn.setRequestProperty(HttpHeaders.ACCEPT, "application/xml"); + conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType()); conn.setRequestMethod("GET"); theHttpInputStream = conn.getInputStream(); } else { @@ -270,10 +227,10 @@ public class RestIterator implements Iterator { } resultStream = theHttpInputStream; - if ("json".equalsIgnoreCase(resultFormatValue)) { - resultJson = IOUtils.toString(resultStream, "UTF-8"); + if ("json".equals(resultOutputFormat)) { + resultJson = IOUtils.toString(resultStream, UTF_8); resultXml = jsonUtils.convertToXML(resultJson); - resultStream = IOUtils.toInputStream(resultXml, "UTF-8"); + resultStream = IOUtils.toInputStream(resultXml, UTF_8); } if (!(emptyXml).equalsIgnoreCase(resultXml)) { @@ -283,15 +240,19 @@ public class RestIterator implements Iterator { for (int i = 0; i < nodeList.getLength(); i++) { StringWriter sw = new StringWriter(); transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw)); - recordQueue.add(sw.toString()); + String toEnqueue = sw.toString(); + if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) { + log.warn("The following record resulted in empty item for the 
feeding queue: " + resultXml); + } else { + recordQueue.add(sw.toString()); + } } } else { - log.info("resultXml is equal with emptyXml"); + log.warn("resultXml is equal with emptyXml"); } resumptionInt += resultSizeValue; - String qUrlArgument = ""; switch (resumptionType.toLowerCase()) { case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items resumptionStr = xprResumptionPath.evaluate(resultNode); @@ -307,7 +268,6 @@ public class RestIterator implements Iterator { } qUrlArgument = qUrl.getQuery(); String[] arrayQUrlArgument = qUrlArgument.split("&"); - int urlOldResumptionSize = 0; for (String arrayUrlArgStr : arrayQUrlArgument) { if (arrayUrlArgStr.startsWith(resumptionParam)) { String[] resumptionKeyValue = arrayUrlArgStr.split("="); @@ -334,7 +294,7 @@ public class RestIterator implements Iterator { discoverResultSize += nodeList.getLength(); } } - log.debug("discoverResultSize: " + discoverResultSize); + log.info("discoverResultSize: {}", discoverResultSize); break; case "pagination": @@ -384,25 +344,24 @@ public class RestIterator implements Iterator { } } catch (Exception e) { - log.error(e); + log.error(e.getMessage(), e); throw new IllegalStateException("collection failed: " + e.getMessage()); } try { if (resultTotal == -1) { resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode)); - if (resumptionType.toLowerCase().equals("page") && !BASIC.equalsIgnoreCase(authMethod)) { + if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) { resultTotal += 1; } // to correct the upper bound log.info("resultTotal was -1 is now: " + resultTotal); } } catch (Exception e) { - log.error(e); + log.error(e.getMessage(), e); throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage()); } log.debug("resultTotal: " + resultTotal); log.debug("resInt: " + resumptionInt); - String nextQuery; if (resumptionInt <= resultTotal) { nextQuery = baseUrl + "?" 
+ queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr + queryFormat; @@ -413,6 +372,7 @@ public class RestIterator implements Iterator { } log.debug("nextQueryUrl: " + nextQuery); return nextQuery; + } private boolean isInteger(String s) { @@ -439,4 +399,12 @@ public class RestIterator implements Iterator { } } + public String getResultFormatValue() { + return resultFormatValue; + } + + public String getResultOutputFormat() { + return resultOutputFormat; + } + } From 098914dcff4fa9a01d229ff610a656853c54602a Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 22 Mar 2021 11:35:02 +0100 Subject: [PATCH 72/86] fix wrong relation with source null --- .../SparkGenerateDOIBoostActionSet.scala | 16 +++++++++------- .../dnetlib/doiboost/crossref/Crossref2Oaf.scala | 13 +++++++------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala index 78477ae4d..21d3454da 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala @@ -38,37 +38,39 @@ object SparkGenerateDOIBoostActionSet { val crossRefRelation = parser.get("crossRefRelation") val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath") val dbOrganizationPath = parser.get("dbOrganizationPath") - val workingDirPath = parser.get("targetPath") val sequenceFilePath = parser.get("sFilePath") val asDataset = spark.read.load(dbDatasetPath).as[OafDataset] + .filter(p => p != null || p.getId != null) .map(d =>DoiBoostMappingUtil.fixResult(d)) .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) -// .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet") + val asPublication =spark.read.load(dbPublicationPath).as[Publication] + .filter(p => p != null || p.getId != null) .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) -// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") + val asOrganization = spark.read.load(dbOrganizationPath).as[Organization] .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) -// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") + val asCRelation = spark.read.load(crossRefRelation).as[Relation] + .filter(r => r!= null || (r.getSource != null && r.getTarget != null)) .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) -// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") + val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation] .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) -// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet") + val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation) -// spark.read.load(s"$workingDirPath/actionSet").as[(String,String)] + d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala 
b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 43b3f7e1c..b051177f5 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -15,7 +15,7 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.matching.Regex -import eu.dnetlib.dhp.schema.scholexplorer.OafUtils; +import eu.dnetlib.dhp.schema.scholexplorer.OafUtils case class CrossrefDT(doi: String, json:String, timestamp: Long) {} @@ -182,7 +182,7 @@ case object Crossref2Oaf { // Ticket #6281 added pid to Instance instance.setPid(result.getPid) - val has_review = (json \ "relation" \"has-review" \ "id") + val has_review = json \ "relation" \"has-review" \ "id" if(has_review != JNothing) { instance.setRefereed( @@ -208,8 +208,9 @@ case object Crossref2Oaf { instance.setUrl(links.asJava) result.setId(IdentifierFactory.createDOIBoostIdentifier(result)) if (result.getId== null) - return null - result + null + else + result } @@ -241,9 +242,9 @@ case object Crossref2Oaf { val result = generateItemFromType(objectType, objectSubType) if (result == null) return List() - val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type")); + val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type")) mappingResult(result, json, cOBJCategory) - if (result == null) + if (result == null || result.getId == null) return List() From c392936b9743e232a28f1b219d55b4322761da2a Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 23 Mar 2021 09:23:22 +0100 Subject: [PATCH 73/86] fixed error on best access right --- .../dnetlib/doiboost/DoiBoostMappingUtil.scala | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 5bd8d6636..03f6653c7 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -107,7 +107,7 @@ object DoiBoostMappingUtil { def fixResult(result: Dataset) :Dataset = { - val instanceType = result.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty) + val instanceType = extractInstance(result) if (instanceType.isDefined) { result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype)) } @@ -135,6 +135,11 @@ object DoiBoostMappingUtil { } + + def extractInstance(r:Result):Option[Instance] = { + r.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty) + } + def fixPublication(input:((String,Publication), (String,HostedByItemType))): Publication = { val publication = input._1._2 @@ -142,7 +147,7 @@ object DoiBoostMappingUtil { val item = if (input._2 != null) input._2._2 else null - val instanceType = publication.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty) + val instanceType:Option[Instance] = extractInstance(publication) if (instanceType.isDefined) { publication.getInstance().asScala.foreach(i => 
i.setInstancetype(instanceType.get.getInstancetype)) @@ -156,7 +161,8 @@ object DoiBoostMappingUtil { hb.setKey(generateDSId(item.id)) if (item.openAccess) i.setAccessright(getOpenAccessQualifier()) - publication.setBestaccessright(getOpenAccessQualifier()) + val ar = getOpenAccessQualifier() + publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) } else { hb.setValue("Unknown Repository") @@ -168,10 +174,12 @@ object DoiBoostMappingUtil { val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright!= null && i.getAccessright.getClassid!= null).map(f=> f.getAccessright.getClassid) if (ar.nonEmpty) { if(ar.contains("OPEN")){ - publication.setBestaccessright(getOpenAccessQualifier()) + val ar = getOpenAccessQualifier() + publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) } else { - publication.setBestaccessright(getRestrictedQualifier()) + val ar = getRestrictedQualifier() + publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) } } publication From 431cbe9955530806bd202cf76b30358ea55f7d6a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 23 Mar 2021 09:28:58 +0100 Subject: [PATCH 74/86] handle missing instance.pid during bulk cleaning --- .../dhp/schema/oaf/CleaningFunctions.java | 26 +++++++++++-------- .../oa/graph/clean/CleaningFunctionTest.java | 3 ++- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java index 1cee3058e..412ed408e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java @@ -149,17 +149,21 @@ public class CleaningFunctions { if (Objects.nonNull(r.getInstance())) { for (Instance i : r.getInstance()) { - final Set pids = Sets.newHashSet(i.getPid()); - i - .setAlternateIdentifier( - Optional - .ofNullable(i.getAlternateIdentifier()) - .map( - altId -> altId - .stream() - .filter(p -> !pids.contains(p)) - .collect(Collectors.toList())) - .orElse(Lists.newArrayList())); + Optional + .ofNullable(i.getPid()) + .ifPresent(pid -> { + final Set pids = Sets.newHashSet(i.getPid()); + i + .setAlternateIdentifier( + Optional + .ofNullable(i.getAlternateIdentifier()) + .map( + altId -> altId + .stream() + .filter(p -> !pids.contains(p)) + .collect(Collectors.toList())) + .orElse(Lists.newArrayList())); + }); if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { i diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java index 3a7a3ee19..0860c8bde 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java @@ -19,6 +19,7 @@ import org.mockito.junit.jupiter.MockitoExtension; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import 
eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -78,7 +79,7 @@ public class CleaningFunctionTest { assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid()); assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname()); - Set pidTerms = vocabularies.getTerms("dnet:pid_types"); + Set pidTerms = vocabularies.getTerms(ModelConstants.DNET_PID_TYPES); assertTrue( p_out .getPid() From b4febed138ffc5394afabd430820e82239cf3e0f Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 23 Mar 2021 09:37:48 +0100 Subject: [PATCH 75/86] updated mapping tests as consequence of the special treatment reserved to Handle PIDs --- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 0ab2403a3..f2786dd9d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -11,6 +11,7 @@ import java.io.IOException; import java.util.List; import java.util.Optional; +import eu.dnetlib.dhp.schema.oaf.utils.PidType; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -396,11 +397,10 @@ public class MappersTest { assertEquals(1, d.getAuthor().size()); assertEquals(1, d.getSubject().size()); assertEquals(1, d.getInstance().size()); - assertTrue(d.getPid().isEmpty()); - - assertTrue(d.getInstance().get(0).getPid().isEmpty()); - assertEquals(1, d.getInstance().get(0).getAlternateIdentifier().size()); - assertEquals("handle", d.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid()); + assertNotNull(d.getPid()); + assertEquals(1, d.getPid().size()); + assertTrue(PidType.isValid(d.getPid().get(0).getQualifier().getClassid())); + assertEquals(PidType.handle, PidType.valueOf(d.getPid().get(0).getQualifier().getClassid())); assertNotNull(d.getInstance().get(0).getUrl()); } @@ -451,7 +451,10 @@ public class MappersTest { assertEquals(1, p.getAuthor().size()); assertEquals("OPEN", p.getBestaccessright().getClassid()); - assertTrue(p.getPid().isEmpty()); + assertTrue(p.getPid().size() == 1); + assertTrue(PidType.isValid(p.getPid().get(0).getQualifier().getClassid())); + assertTrue(PidType.handle.equals(PidType.valueOf(p.getPid().get(0).getQualifier().getClassid()))); + assertEquals("hdl:11858/00-1734-0000-0003-EE73-2", p.getPid().get(0).getValue()); assertEquals("dataset", p.getResulttype().getClassname()); assertEquals(1, p.getInstance().size()); assertEquals("OPEN", p.getInstance().get(0).getAccessright().getClassid()); @@ -461,11 +464,8 @@ public class MappersTest { "http://creativecommons.org/licenses/by/3.0/de/legalcode", p.getInstance().get(0).getLicense().getValue()); assertEquals(1, p.getInstance().size()); - assertEquals(1, p.getInstance().get(0).getAlternateIdentifier().size()); - assertEquals("handle", p.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid()); - assertEquals( - "hdl:11858/00-1734-0000-0003-EE73-2", p.getInstance().get(0).getAlternateIdentifier().get(0).getValue()); - + assertNotNull(p.getInstance().get(0).getAlternateIdentifier()); + assertEquals(0, 
p.getInstance().get(0).getAlternateIdentifier().size()); assertEquals(1, p.getInstance().get(0).getUrl().size()); } From 625e4c29c417f1153cd2fe10e45a8dbf5d7f6c25 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 23 Mar 2021 09:39:56 +0100 Subject: [PATCH 76/86] added model constants --- .../dnetlib/doiboost/DoiBoostMappingUtil.scala | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 03f6653c7..f2c63cee6 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -5,6 +5,7 @@ import eu.dnetlib.dhp.schema.oaf.{AccessRight, DataInfo, Dataset, Field, Instanc import eu.dnetlib.dhp.utils.DHPUtils import org.apache.commons.lang3.StringUtils import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.scholexplorer.OafUtils import org.json4s import org.json4s.DefaultFormats @@ -112,18 +113,12 @@ object DoiBoostMappingUtil { result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype)) } result.getInstance().asScala.foreach(i => { - i.setHostedby(getUnknownHostedBy()) + i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY) }) result } - def getUnknownHostedBy():KeyValue = { - val hb = new KeyValue - hb.setValue("Unknown Repository") - hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c") - hb - } def getOpenAccessQualifier():AccessRight = { @@ -155,7 +150,7 @@ object DoiBoostMappingUtil { publication.getInstance().asScala.foreach(i => { - val hb = new KeyValue + var hb = new KeyValue if (item != null) { hb.setValue(item.officialname) hb.setKey(generateDSId(item.id)) @@ -165,15 +160,14 @@ object DoiBoostMappingUtil { publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) } else { - hb.setValue("Unknown Repository") - hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c") + hb = ModelConstants.UNKNOWN_REPOSITORY } i.setHostedby(hb) }) val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright!= null && i.getAccessright.getClassid!= null).map(f=> f.getAccessright.getClassid) if (ar.nonEmpty) { - if(ar.contains("OPEN")){ + if(ar.contains(ModelConstants.ACCESS_RIGHT_OPEN)){ val ar = getOpenAccessQualifier() publication.setBestaccessright(OafUtils.createQualifier(ar.getClassid, ar.getClassname, ar.getSchemeid, ar.getSchemename)) } From 8db248aa13cf1fda9882b82fa84e926adc934856 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 23 Mar 2021 09:56:34 +0100 Subject: [PATCH 77/86] avoiding error on jenkins compilations: java.net.BindException: Cannot assign requested address: Service 'sparkDriver' failed after 16 retries (on a random free port)! 
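The BindException is raised while the test JVM binds the 'sparkDriver' service: on the Jenkins
executors the advertised hostname cannot always be bound, so the driver retries on random ports
until it gives up. The tests now share a single SparkConf that pins the driver host to localhost
and disables the Web UI, so no extra ports are probed. A minimal sketch of the shared setup,
assuming a JUnit 5 test with a per-test SparkSession as in TransformationJobTest (the wiring
below is illustrative, not the exact test code):

    import org.apache.spark.SparkConf;
    import org.apache.spark.sql.SparkSession;

    SparkConf sparkConf = new SparkConf();
    sparkConf.setMaster("local[*]");                  // run in-process, no cluster needed
    sparkConf.set("spark.driver.host", "localhost");  // bind the driver to an always-resolvable address
    sparkConf.set("spark.ui.enabled", "false");       // skip allocating a UI port on CI

    // SparkSession implements Closeable, so each test can scope its own session:
    try (SparkSession spark = SparkSession.builder().config(sparkConf).getOrCreate()) {
        // run the job under test against this session
    }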
--- .../transformation/TransformationJobTest.java | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 62a5223d9..f3a0685ac 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -34,9 +34,16 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; @ExtendWith(MockitoExtension.class) public class TransformationJobTest extends AbstractVocabularyTest { + private SparkConf sparkConf; + @BeforeEach public void setUp() throws IOException, ISLookUpException { setUpVocabulary(); + + sparkConf = new SparkConf(); + sparkConf.setMaster("local[*]"); + sparkConf.set("spark.driver.host", "localhost"); + sparkConf.set("spark.ui.enabled", "false"); } @Test @@ -124,11 +131,7 @@ public class TransformationJobTest extends AbstractVocabularyTest { @DisplayName("Test TransformSparkJobNode.main with oaiOpenaire_datacite (v4)") public void transformTestITGv4OAIdatacite(@TempDir Path testDir) throws Exception { - SparkConf conf = new SparkConf(); - conf.setAppName(TransformationJobTest.class.getSimpleName()); - conf.setMaster("local"); - - try (SparkSession spark = SparkSession.builder().config(conf).getOrCreate()) { + try (SparkSession spark = SparkSession.builder().config(sparkConf).getOrCreate()) { final String mdstore_input = this .getClass() @@ -190,11 +193,7 @@ public class TransformationJobTest extends AbstractVocabularyTest { @DisplayName("Test TransformSparkJobNode.main") public void transformTest(@TempDir Path testDir) throws Exception { - SparkConf conf = new SparkConf(); - conf.setAppName(TransformationJobTest.class.getSimpleName()); - conf.setMaster("local"); - - try (SparkSession spark = SparkSession.builder().config(conf).getOrCreate()) { + try (SparkSession spark = SparkSession.builder().config(sparkConf).getOrCreate()) { final String mdstore_input = this .getClass() From e5ebb500cf7f8d2f804e019467984e77d37d2bed Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 23 Mar 2021 12:13:53 +0100 Subject: [PATCH 78/86] fixed pom versions; included missing workflow modules in dhp-workflows/pom.xml --- dhp-workflows/dhp-stats-promote/pom.xml | 2 +- dhp-workflows/dhp-usage-raw-data-update/pom.xml | 2 +- dhp-workflows/dhp-usage-stats-build/pom.xml | 2 +- dhp-workflows/pom.xml | 3 +++ 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-stats-promote/pom.xml b/dhp-workflows/dhp-stats-promote/pom.xml index f22c19047..c64c2f58e 100644 --- a/dhp-workflows/dhp-stats-promote/pom.xml +++ b/dhp-workflows/dhp-stats-promote/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-branch_hadoop_aggregator-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 dhp-stats-promote diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml index 3d01ad847..a78f92d41 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/pom.xml +++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-branch_hadoop_aggregator-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 dhp-usage-raw-data-update diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml index bf580ed7f..20d2f5b76 100644 
--- a/dhp-workflows/dhp-usage-stats-build/pom.xml +++ b/dhp-workflows/dhp-usage-stats-build/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.2.4-branch_hadoop_aggregator-SNAPSHOT + 1.2.4-SNAPSHOT 4.0.0 dhp-usage-stats-build diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 190c9847e..ec8f9268c 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -28,6 +28,9 @@ dhp-graph-provision-scholexplorer dhp-blacklist dhp-stats-update + dhp-stats-promote + dhp-usage-stats-build + dhp-usage-raw-data-update dhp-broker-events dhp-doiboost From 1e423fdc0768ea1a3d1b9a7bee4bd50a77a8f9b7 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 23 Mar 2021 13:39:24 +0100 Subject: [PATCH 79/86] [Actionmanager] remove invalid records from the input graph before groupGraphTableByIdAndMerge --- .../actionmanager/promote/PromoteActionPayloadFunctions.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java index 56c8dd05a..c0192cddb 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java @@ -111,7 +111,9 @@ public class PromoteActionPayloadFunctions { SerializableSupplier> isNotZeroFn, Class rowClazz) { TypedColumn aggregator = new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn(); + return rowDS + .filter((FilterFunction) o -> isNotZeroFn.get().apply(o)) .groupByKey((MapFunction) x -> rowIdFn.get().apply(x), Encoders.STRING()) .agg(aggregator) .map((MapFunction, G>) Tuple2::_2, Encoders.kryo(rowClazz)); From 751125fdf95b9435f5ef86769a9d5b9623cfca14 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 23 Mar 2021 17:34:32 +0100 Subject: [PATCH 80/86] [Actionmanager] zero function considers empty entity.id as well as rel.source/rel.target --- .../promote/PromoteActionPayloadForGraphTableJob.java | 11 ++++++----- .../promote/PromoteActionPayloadFunctions.java | 1 - .../java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java index bab4377bd..0052026d4 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java @@ -5,12 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; import java.io.IOException; -import java.util.Objects; import java.util.Optional; import java.util.function.BiFunction; import java.util.function.Function; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -194,7 +194,7 @@ public class PromoteActionPayloadForGraphTableJob { 
SerializableSupplier> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy); SerializableSupplier> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy); SerializableSupplier zeroFn = zeroFn(rowClazz); - SerializableSupplier> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource; + SerializableSupplier> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSourceAndTarget; Dataset joinedAndMerged = PromoteActionPayloadFunctions .joinGraphTableWithActionPayloadAndMerge( @@ -238,12 +238,13 @@ public class PromoteActionPayloadForGraphTableJob { } } - private static Function isNotZeroFnUsingIdOrSource() { + private static Function isNotZeroFnUsingIdOrSourceAndTarget() { return t -> { if (isSubClass(t, Relation.class)) { - return Objects.nonNull(((Relation) t).getSource()); + final Relation rel = (Relation) t; + return StringUtils.isNotBlank(rel.getSource()) && StringUtils.isNotBlank(rel.getTarget()); } - return Objects.nonNull(((OafEntity) t).getId()); + return StringUtils.isNotBlank(((OafEntity) t).getId()); }; } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java index c0192cddb..d799c646b 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java @@ -111,7 +111,6 @@ public class PromoteActionPayloadFunctions { SerializableSupplier> isNotZeroFn, Class rowClazz) { TypedColumn aggregator = new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn(); - return rowDS .filter((FilterFunction) o -> isNotZeroFn.get().apply(o)) .groupByKey((MapFunction) x -> rowIdFn.get().apply(x), Encoders.STRING()) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index f2786dd9d..c86e31280 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -11,7 +11,6 @@ import java.io.IOException; import java.util.List; import java.util.Optional; -import eu.dnetlib.dhp.schema.oaf.utils.PidType; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -33,6 +32,7 @@ import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.utils.PidType; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @ExtendWith(MockitoExtension.class) From 827e7e37db7fa9774bb4d692d9ed160ce2837f39 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 25 Mar 2021 11:07:59 +0100 Subject: [PATCH 81/86] [Cleaning] drop instance.alternateIdentifier elements when they are available among instance.pid --- .../dhp/schema/oaf/CleaningFunctions.java | 14 +--- .../oa/graph/clean/CleaningFunctionTest.java | 64 +++++++++++++++++-- .../eu/dnetlib/dhp/oa/graph/clean/result.json | 44 +++++++++++++ 3 files changed, 107 insertions(+), 15 deletions(-) diff --git 
a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java index 412ed408e..afbe0cff6 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java @@ -152,17 +152,9 @@ public class CleaningFunctions { Optional .ofNullable(i.getPid()) .ifPresent(pid -> { - final Set pids = Sets.newHashSet(i.getPid()); - i - .setAlternateIdentifier( - Optional - .ofNullable(i.getAlternateIdentifier()) - .map( - altId -> altId - .stream() - .filter(p -> !pids.contains(p)) - .collect(Collectors.toList())) - .orElse(Lists.newArrayList())); + final Set pids = Sets.newHashSet(pid); + final Set altIds = Sets.newHashSet(i.getAlternateIdentifier()); + i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids))); }); if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java index 0860c8bde..fdbc58c17 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java @@ -87,11 +87,67 @@ public class CleaningFunctionTest { .map(p -> p.getQualifier()) .allMatch(q -> pidTerms.contains(q.getClassid()))); - Publication p_defaults = CleaningFunctions.cleanup(p_out); - assertEquals("CLOSED", p_defaults.getBestaccessright().getClassid()); + List poi = p_out.getInstance(); + assertNotNull(poi); + assertEquals(1, poi.size()); + + final Instance poii = poi.get(0); + assertNotNull(poii); + assertNotNull(poii.getPid()); + + assertEquals(2, poii.getPid().size()); + + assertTrue( + poii.getPid().stream().filter(s -> s.getValue().equals("10.1007/s109090161569x")).findFirst().isPresent()); + assertTrue(poii.getPid().stream().filter(s -> s.getValue().equals("10.1008/abcd")).findFirst().isPresent()); + + assertNotNull(poii.getAlternateIdentifier()); + assertEquals(2, poii.getAlternateIdentifier().size()); + + assertTrue( + poii + .getAlternateIdentifier() + .stream() + .filter(s -> s.getValue().equals("10.1007/s109090161569x")) + .findFirst() + .isPresent()); + assertTrue( + poii + .getAlternateIdentifier() + .stream() + .filter(s -> s.getValue().equals("10.1009/qwerty")) + .findFirst() + .isPresent()); + + Publication p_cleaned = CleaningFunctions.cleanup(p_out); + assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid()); assertNull(p_out.getPublisher()); - getAuthorPids(p_defaults).forEach(pid -> { + final List pci = p_cleaned.getInstance(); + assertNotNull(pci); + assertEquals(1, pci.size()); + + final Instance pcii = pci.get(0); + assertNotNull(pcii); + assertNotNull(pcii.getPid()); + + assertEquals(2, pcii.getPid().size()); + + assertTrue( + pcii.getPid().stream().filter(s -> s.getValue().equals("10.1007/s109090161569x")).findFirst().isPresent()); + assertTrue(pcii.getPid().stream().filter(s -> s.getValue().equals("10.1008/abcd")).findFirst().isPresent()); + + assertNotNull(pcii.getAlternateIdentifier()); + assertEquals(1, pcii.getAlternateIdentifier().size()); + assertTrue( + pcii + .getAlternateIdentifier() + .stream() + .filter(s -> s.getValue().equals("10.1009/qwerty")) + .findFirst() + .isPresent()); + 
+ getAuthorPids(p_cleaned).forEach(pid -> { System.out .println( String @@ -101,7 +157,7 @@ public class CleaningFunctionTest { }); // TODO add more assertions to verity the cleaned values - System.out.println(MAPPER.writeValueAsString(p_out)); + System.out.println(MAPPER.writeValueAsString(p_cleaned)); /* * assertTrue( p_out .getPid() .stream() .allMatch(sp -> StringUtils.isNotBlank(sp.getValue()))); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index e746d236e..23de2ef86 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -318,6 +318,50 @@ "id": "50|CSC_________::2250a70c903c6ac6e4c01438259e9375", "instance": [ { + "pid": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1008/abcd" + } + ], + "alternateIdentifier": [ + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1007/s109090161569x" + }, + { + "dataInfo": null, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1009/qwerty" + } + ], "accessright": { "classid": "CLOSED", "classname": "CLOSED", From b5b7dc210401c1a9de68aad3c7ff9212cf355bb6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 26 Mar 2021 12:30:00 +0100 Subject: [PATCH 82/86] [Cleaning] drop alternate identifiers with empty values --- .../java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java index afbe0cff6..6c7d3e915 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java @@ -152,7 +152,12 @@ public class CleaningFunctions { Optional .ofNullable(i.getPid()) .ifPresent(pid -> { - final Set pids = Sets.newHashSet(pid); + final Set pids = Sets + .newHashSet( + pid + .stream() + .filter(p -> StringUtils.isBlank(p.getValue())) + .collect(Collectors.toList())); final Set altIds = Sets.newHashSet(i.getAlternateIdentifier()); i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids))); }); From 1dfda3624e6c030ee88c51e7c2f9a526ad1ae3c7 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 26 Mar 2021 13:56:29 +0100 Subject: [PATCH 83/86] improved workflow importing datacite --- .../datacite/AbstractRestClient.scala | 5 +- .../datacite/DataciteAPIImporter.scala | 10 +- .../DataciteToOAFTransformation.scala | 3 +- .../datacite/ImportDatacite.scala | 92 +++++++++++-------- .../datacite/import_from_api.json | 6 ++ .../datacite/oozie_app/workflow.xml | 8 +- .../SparkGenerateDOIBoostActionSet.scala | 2 +- .../crossref/CrossrefMappingTest.scala | 11 +++ 8 files changed, 90 insertions(+), 47 deletions(-) diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala index 3c7770075..8df203283 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala @@ -57,10 +57,13 @@ abstract class AbstractRestClient extends Iterator[String]{ private def doHTTPRequest[A <: HttpUriRequest](r: A) :String ={ val client = HttpClients.createDefault + var tries = 4 try { - var tries = 4 while (tries > 0) { + + println(s"requesting ${r.getURI}") val response = client.execute(r) + println(s"get response with status${response.getStatusLine.getStatusCode}") if (response.getStatusLine.getStatusCode > 400) { tries -= 1 } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala index c2ad6855c..36ec9e8c3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.actionmanager.datacite import org.json4s.{DefaultFormats, JValue} import org.json4s.jackson.JsonMethods.{compact, parse, render} -class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10) extends AbstractRestClient { +class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient { override def extractInfo(input: String): Unit = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats @@ -16,9 +16,15 @@ class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10) extends Abstra current_index = 0 } + def get_url():String ={ + val to = if (until> 0) s"$until" else "*" + s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$to]" + + } + override def getBufferData(): Unit = { if (!complete) { - val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20*]") + val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url()) extractInfo(response) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala index 1ae1f086e..1776a4ad6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala @@ -164,9 +164,8 @@ object DataciteToOAFTransformation { case _: Throwable => try { return Some(LocalDate.parse(a_date, df_it).toString) } catch { - case _: Throwable => try { + case _: Throwable => return None - } } } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala index d5edb674a..6cec4ea34 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.actionmanager.datacite +import eu.dnetlib.dhp.actionmanager.datacite.DataciteToOAFTransformation.df_it import eu.dnetlib.dhp.application.ArgumentApplicationParser import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} @@ -15,7 +16,7 @@ import org.apache.spark.sql.functions.max import org.slf4j.{Logger, LoggerFactory} import java.time.format.DateTimeFormatter._ -import java.time.{LocalDateTime, ZoneOffset} +import java.time.{LocalDate, LocalDateTime, ZoneOffset} import scala.io.Source object ImportDatacite { @@ -23,21 +24,20 @@ object ImportDatacite { val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass) - def convertAPIStringToDataciteItem(input:String): DataciteType = { + def convertAPIStringToDataciteItem(input: String): DataciteType = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json: org.json4s.JValue = parse(input) val doi = (json \ "attributes" \ "doi").extract[String].toLowerCase val isActive = (json \ "attributes" \ "isActive").extract[Boolean] - val timestamp_string = (json \ "attributes" \ "updated").extract[String] + val timestamp_string = (json \ "attributes" \ "updated").extract[String] val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME) - DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli/1000, isActive = isActive, json = input) + DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input) } - def main(args: Array[String]): Unit = { val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString) @@ -53,9 +53,13 @@ object ImportDatacite { val dataciteDump = parser.get("dataciteDumpPath") log.info(s"dataciteDump is $dataciteDump") - val hdfsTargetPath =new Path(targetPath) + val hdfsTargetPath = new Path(targetPath) log.info(s"hdfsTargetPath is $hdfsTargetPath") + + val spkipImport = parser.get("skipImport") + log.info(s"skipImport is $spkipImport") + val spark: SparkSession = SparkSession.builder() .appName(ImportDatacite.getClass.getSimpleName) .master(master) @@ -69,7 +73,7 @@ object ImportDatacite { // Because of Maven conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName) conf.set("fs.file.impl", classOf[LocalFileSystem].getName) - val sc:SparkContext = spark.sparkContext + val sc: SparkContext = spark.sparkContext sc.setLogLevel("ERROR") import spark.implicits._ @@ -84,14 +88,14 @@ object ImportDatacite { return a if (a == null) return b - if(a.timestamp >b.timestamp) { + if (a.timestamp > b.timestamp) { return a } b } override def merge(a: DataciteType, b: DataciteType): DataciteType = { - reduce(a,b) + reduce(a, b) } override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]] @@ -101,69 +105,77 @@ object ImportDatacite { override def finish(reduction: DataciteType): DataciteType = reduction } - val dump:Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType] + val dump: Dataset[DataciteType] = 
spark.read.load(dataciteDump).as[DataciteType] val ts = dump.select(max("timestamp")).first().getLong(0) - log.info(s"last Timestamp is $ts") + println(s"last Timestamp is $ts") - val cnt = writeSequenceFile(hdfsTargetPath, ts, conf) + val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf) + println(s"Imported from Datacite API $cnt documents") - log.info(s"Imported from Datacite API $cnt documents") + if (cnt > 0) { - if (cnt > 0) { - - val inputRdd:RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text]) + val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text]) .map(s => s._2.toString) .map(s => convertAPIStringToDataciteItem(s)) spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset") - val ds:Dataset[DataciteType] = spark.read.load(s"${targetPath}_dataset").as[DataciteType] + val ds: Dataset[DataciteType] = spark.read.load(s"${targetPath}_dataset").as[DataciteType] dump .union(ds) .groupByKey(_.doi) .agg(dataciteAggregator.toColumn) - .map(s=>s._2) + .map(s => s._2) .repartition(4000) .write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated") val fs = FileSystem.get(sc.hadoopConfiguration) fs.delete(new Path(s"$dataciteDump"), true) - fs.rename(new Path(s"${dataciteDump}_updated"),new Path(s"$dataciteDump")) + fs.rename(new Path(s"${dataciteDump}_updated"), new Path(s"$dataciteDump")) } } - private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration):Long = { - val client = new DataciteAPIImporter(timestamp*1000, 1000) + private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration): Long = { + var from:Long = timestamp * 1000 + val delta:Long = 50000000L + var client: DataciteAPIImporter = null + val now :Long =System.currentTimeMillis() var i = 0 try { val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text])) try { - var start: Long = System.currentTimeMillis - var end: Long = 0 - val key: IntWritable = new IntWritable(i) - val value: Text = new Text - while ( { - client.hasNext - }) { - key.set({ - i += 1; - i - 1 - }) - value.set(client.next()) - writer.append(key, value) - writer.hflush() - if (i % 1000 == 0) { - end = System.currentTimeMillis - val time = (end - start) / 1000.0F - println(s"Imported $i in $time seconds") - start = System.currentTimeMillis + while (from < now) { + client = new DataciteAPIImporter(from, 1000, from + delta) + var end: Long = 0 + val key: IntWritable = new IntWritable(i) + val value: Text = new Text + while (client.hasNext) { + key.set({ + i += 1; + i - 1 + }) + value.set(client.next()) + writer.append(key, value) + writer.hflush() + if (i % 1000 == 0) { + end = System.currentTimeMillis + val time = (end - start) / 1000.0F + println(s"Imported $i in $time seconds") + start = System.currentTimeMillis + } } + println(s"updating from value: $from -> ${from+delta}") + from = from + delta } + } catch { + case e: Throwable => + println("Error", e) } finally if (writer != null) writer.close() } i } + } \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json index 967e4445a..69fb039ba 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json @@ -12,6 +12,12 @@ "paramDescription": "the path of the Datacite dump", "paramRequired": true }, + { + "paramName": "s", + "paramLongName": "skipImport", + "paramDescription": "avoid to downlaod new items but apply the previous update", + "paramRequired": false + }, { "paramName": "n", "paramLongName": "namenode", diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml index 047794c9c..15378c6c7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml @@ -13,6 +13,11 @@ nativeInputPath the path of the input MDStore + + skipimport + false + the path of the input MDStore + @@ -51,6 +56,7 @@ -t${nativeInputPath} -d${mdstoreInputPath} -n${nameNode} + -s${skipimport} --masteryarn-cluster @@ -81,7 +87,7 @@ -tr${isLookupUrl} --masteryarn-cluster - + diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala index 21d3454da..3bfca0859 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/SparkGenerateDOIBoostActionSet.scala @@ -57,7 +57,7 @@ object SparkGenerateDOIBoostActionSet { val asCRelation = spark.read.load(crossRefRelation).as[Relation] - .filter(r => r!= null || (r.getSource != null && r.getTarget != null)) + .filter(r => r!= null && r.getSource != null && r.getTarget != null) .map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala index 4568e23a5..cc112528e 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala @@ -59,6 +59,17 @@ class CrossrefMappingTest { } + @Test + def testSum() :Unit = { + val from:Long = 1613135645000L + val delta:Long = 1000000L + + + println(s"updating from value: $from -> ${from+delta}") + + + } + @Test def testOrcidID() :Unit = { val json = Source.fromInputStream(getClass.getResourceAsStream("orcid_data.json")).mkString From 48f2b6127ea739a81f3c43be659a45dfa22bcd51 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 29 Mar 2021 14:23:18 +0200 Subject: [PATCH 84/86] [Cleaning] drop alternate identifiers with empty values --- .../dhp/schema/oaf/CleaningFunctions.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java index 6c7d3e915..3be062c0c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java +++ 
b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java @@ -152,14 +152,20 @@ public class CleaningFunctions { Optional .ofNullable(i.getPid()) .ifPresent(pid -> { - final Set pids = Sets - .newHashSet( + final Set pids = pid .stream() - .filter(p -> StringUtils.isBlank(p.getValue())) - .collect(Collectors.toList())); - final Set altIds = Sets.newHashSet(i.getAlternateIdentifier()); - i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids))); + .filter(p -> StringUtils.isNotBlank(p.getValue())) + .collect(Collectors.toCollection(HashSet::new)); + + Optional.ofNullable(i.getAlternateIdentifier()) + .ifPresent(altId -> { + final Set altIds = altId.stream() + .filter(p -> StringUtils.isNotBlank(p.getValue())) + .collect(Collectors.toCollection(HashSet::new)); + + i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids))); + }); }); if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { From a0837ac357b9ae5d3ad5f9161f7a4826cb6b0c73 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 29 Mar 2021 15:59:58 +0200 Subject: [PATCH 85/86] [Stats update] integrating PR#100 for testing https://code-repo.d4science.org/D-Net/dnet-hadoop/pulls/100 --- 100.patch | 757 ++++++++++++++++++ .../oa/graph/stats/oozie_app/impala-shell.sh | 18 - .../scripts/computeProductionStats.sql | 8 - .../scripts/updateProductionViews.sql | 207 ----- .../stats/oozie_app/updateProductionViews.sh | 16 + .../dhp/oa/graph/stats/oozie_app/workflow.xml | 41 +- .../dhp/oa/graph/stats/oozie_app/contexts.sh | 43 + .../graph/stats/oozie_app/scripts/step10.sql | 13 - .../graph/stats/oozie_app/scripts/step2.sql | 5 +- .../graph/stats/oozie_app/scripts/step3.sql | 5 +- .../graph/stats/oozie_app/scripts/step4.sql | 5 +- .../graph/stats/oozie_app/scripts/step5.sql | 5 +- .../dhp/oa/graph/stats/oozie_app/workflow.xml | 17 + 13 files changed, 874 insertions(+), 266 deletions(-) create mode 100644 100.patch delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql create mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh diff --git a/100.patch b/100.patch new file mode 100644 index 000000000..f28cdd0a5 --- /dev/null +++ b/100.patch @@ -0,0 +1,757 @@ +From c5fbad8093ca27deebf1b5fd5ffd39e1877c533d Mon Sep 17 00:00:00 2001 +From: antleb +Date: Thu, 4 Mar 2021 00:42:21 +0200 +Subject: [PATCH 1/8] Contexts are now downloaded instead of using the + stats_ext db + +--- + .../dhp/oa/graph/stats/oozie_app/contexts.sh | 33 +++++++++++++++++++ + .../graph/stats/oozie_app/scripts/step10.sql | 13 -------- + .../dhp/oa/graph/stats/oozie_app/workflow.xml | 17 ++++++++++ + 3 files changed, 50 insertions(+), 13 deletions(-) + create mode 100644 dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh + +diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh 
b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +new file mode 100644 +index 00000000..f06a43bb +--- /dev/null ++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +@@ -0,0 +1,33 @@ ++#!/usr/bin/env bash ++ ++CONTEXT_API=$1 ++TARGET_DB=$2 ++ ++TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 contexts.csv ++cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv ++cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv ++cat contexts.csv | cut -f1 -d, | sed 's/\(.*\)/\1,\1::other,other/' >> categories.csv ++cat categories.csv | cut -d, -f2 | sed 's/\(.*\)/\1,\1::other,other/' >> concepts.csv ++ ++echo "uploading context data to hdfs" ++hdfs dfs -mkdir ${TMP} ++hdfs dfs -copyFromLocal contexts.csv ${TMP} ++hdfs dfs -copyFromLocal categories.csv ${TMP} ++hdfs dfs -copyFromLocal concepts.csv ${TMP} ++hdfs dfs -chmod -R 777 ${TMP} ++ ++echo "Creating and populating impala tables" ++impala-shell -c "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ',';" ++impala-shell -c "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ',';" ++impala-shell -c "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ',';" ++impala-shell -c "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context;" ++impala-shell -c "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category;" ++impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept;" ++ ++echo "Cleaning up" ++hdfs dfs -rm -f -r -skipTrash ${TMP} ++ ++echo "Finito!" 
+\ No newline at end of file +diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +index 6c96317e..77fbd3b1 100644 +--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql ++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +@@ -23,19 +23,6 @@ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS + SELECT * + FROM ${external_stats_db_name}.rndexpediture; + +-CREATE OR REPLACE VIEW ${stats_db_name}.context AS +-SELECT * +-FROM ${external_stats_db_name}.context; +- +-CREATE OR REPLACE VIEW ${stats_db_name}.category AS +-SELECT * +-FROM ${external_stats_db_name}.category; +- +-CREATE OR REPLACE VIEW ${stats_db_name}.concept AS +-SELECT * +-FROM ${external_stats_db_name}.concept; +- +- + ------------------------------------------------------------------------------------------------ + ------------------------------------------------------------------------------------------------ + -- Creation date of the database +diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +index 9c16f149..afb10c41 100644 +--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml ++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +@@ -41,6 +41,10 @@ + hive_timeout + the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds. 
+ ++ ++ context_api_url ++ the base url of the context api (https://services.openaire.eu/openaire) ++ + + + +@@ -263,6 +267,19 @@ + + + ++ ++ ++ ++ ${jobTracker} ++ ${nameNode} ++ contexts.sh ++ ${context_api_url} ++ ${stats_db_name} ++ contexts.sh ++ ++ ++ ++ + + + +-- +2.17.1 + + +From 6147ee495053634436abe822aaf9ba909813d8c4 Mon Sep 17 00:00:00 2001 +From: antleb +Date: Fri, 5 Mar 2021 14:12:18 +0200 +Subject: [PATCH 2/8] assigning correctly hive contexts to concepts + +--- + .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh | 7 +++++-- + .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql | 5 ++++- + .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql | 5 ++++- + .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql | 5 ++++- + .../dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql | 5 ++++- + 5 files changed, 21 insertions(+), 6 deletions(-) + +diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +index f06a43bb..6788f88b 100644 +--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh ++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +@@ -9,8 +9,8 @@ echo "Downloading context data" + curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv + cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv + cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv +-cat contexts.csv | cut -f1 -d, | sed 's/\(.*\)/\1,\1::other,other/' >> categories.csv +-cat categories.csv | cut -d, -f2 | sed 's/\(.*\)/\1,\1::other,other/' >> concepts.csv ++cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv ++cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv + + echo "uploading context data to hdfs" + hdfs dfs -mkdir ${TMP} +@@ -29,5 +29,8 @@ impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}. + + echo "Cleaning up" + hdfs dfs -rm -f -r -skipTrash ${TMP} ++rm concepts.csv ++rm categories.csv ++rm contexts.csv + + echo "Finito!" 
+\ No newline at end of file +diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +index 62a15856..75b24b18 100644 +--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql ++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +@@ -47,7 +47,10 @@ from ${openaire_db_name}.publication p + where p.datainfo.deletedbyinference = false; + + CREATE TABLE ${stats_db_name}.publication_concepts AS +-SELECT substr(p.id, 4) as id, contexts.context.id as concept ++SELECT substr(p.id, 4) as id, case ++ when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id ++ when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') ++ when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept + from ${openaire_db_name}.publication p + LATERAL VIEW explode(p.context) contexts as context + where p.datainfo.deletedbyinference = false; +diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +index dcd5ad85..540cc03a 100644 +--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql ++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.dataset p + where p.datainfo.deletedbyinference = false; + + CREATE TABLE ${stats_db_name}.dataset_concepts AS +-SELECT substr(p.id, 4) as id, contexts.context.id as concept ++SELECT substr(p.id, 4) as id, case ++ when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id ++ when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') ++ when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept + from ${openaire_db_name}.dataset p + LATERAL VIEW explode(p.context) contexts as context + where p.datainfo.deletedbyinference = false; +diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +index fd5390e6..54345e07 100644 +--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql ++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql +@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.software p + where p.datainfo.deletedbyinference = false; + + CREATE TABLE ${stats_db_name}.software_concepts AS +-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept ++SELECT substr(p.id, 4) as id, case ++ when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id ++ when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') ++ when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept + FROM ${openaire_db_name}.software p + LATERAL VIEW explode(p.context) contexts AS context + where 
p.datainfo.deletedbyinference = false; +diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +index b359b596..36ad5d92 100644 +--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql ++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql +@@ -52,7 +52,10 @@ FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance. + where p.datainfo.deletedbyinference = false; + + CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS +-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept ++SELECT substr(p.id, 4) as id, case ++ when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id ++ when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') ++ when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept + FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context + where p.datainfo.deletedbyinference = false; + +-- +2.17.1 + + +From f40c150a0d549e2dbcfd42ecf81e17ad4b505391 Mon Sep 17 00:00:00 2001 +From: antleb +Date: Sat, 6 Mar 2021 00:35:57 +0200 +Subject: [PATCH 3/8] fixed steps... + +--- + .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +index afb10c41..2184cb8a 100644 +--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml ++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +@@ -264,7 +264,7 @@ + stats_db_name=${stats_db_name} + openaire_db_name=${openaire_db_name} + +- ++ + + + +@@ -277,7 +277,7 @@ + ${stats_db_name} + contexts.sh + +- ++ + + + +-- +2.17.1 + + +From fa1ec5b5e9b6038b3b565422af5c6406f21220d3 Mon Sep 17 00:00:00 2001 +From: antleb +Date: Wed, 10 Mar 2021 14:05:58 +0200 +Subject: [PATCH 4/8] fixed typo... 
+ +--- + .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +index 2184cb8a..321500e2 100644 +--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml ++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +@@ -277,7 +277,7 @@ + ${stats_db_name} + contexts.sh + +- ++ + + + +-- +2.17.1 + + +From 3c75a050443942b632cf8469b5af16a8c61e7569 Mon Sep 17 00:00:00 2001 +From: antleb +Date: Fri, 12 Mar 2021 13:47:04 +0200 +Subject: [PATCH 5/8] fixed a ton of typos + +--- + .../scripts/computeProductionStats.sql | 8 ------- + .../stats/oozie_app/updateProductionViews.sh | 18 ++++++++++++++++ + .../dhp/oa/graph/stats/oozie_app/contexts.sh | 21 ++++++++++++------- + 3 files changed, 32 insertions(+), 15 deletions(-) + delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql + create mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh + +diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql +deleted file mode 100644 +index 34e48a18..00000000 +--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql ++++ /dev/null +@@ -1,8 +0,0 @@ +------------------------------------------------------- +------------------------------------------------------- +--- Impala table statistics - Needed to make the tables +--- visible for impala +------------------------------------------------------- +------------------------------------------------------- +- +-INVALIDATE METADATA ${stats_db_name}; +diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh +new file mode 100644 +index 00000000..57acb2ee +--- /dev/null ++++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh +@@ -0,0 +1,18 @@ ++export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs ++export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) ++if ! 
[ -L $link_folder ] ++then ++ rm -Rf "$link_folder" ++ ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} ++fi ++ ++export SOURCE=$1 ++export SHADOW=$2 ++ ++echo "Updating shadow database" ++impala-shell -d ${SOURCE} -q "invalidate metadata" ++impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f - ++impala-shell -q "create database if not exists ${SHADOW}" ++impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f - ++impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - ++echo "Shadow db ready!" +\ No newline at end of file +diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +index 6788f88b..c28be50d 100644 +--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh ++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh +@@ -1,4 +1,10 @@ +-#!/usr/bin/env bash ++export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs ++export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) ++if ! [ -L $link_folder ] ++then ++ rm -Rf "$link_folder" ++ ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} ++fi + + CONTEXT_API=$1 + TARGET_DB=$2 +@@ -20,12 +26,13 @@ hdfs dfs -copyFromLocal concepts.csv ${TMP} + hdfs dfs -chmod -R 777 ${TMP} + + echo "Creating and populating impala tables" +-impala-shell -c "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ',';" +-impala-shell -c "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ',';" +-impala-shell -c "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ',';" +-impala-shell -c "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context;" +-impala-shell -c "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category;" +-impala-shell -c "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept;" ++impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','" ++impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','" ++impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','" ++impala-shell -d ${TARGET_DB} -q "invalidate metadata" ++impala-shell -q "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context" ++impala-shell -q "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category" ++impala-shell -q "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept" + + echo "Cleaning up" + hdfs dfs -rm -f -r -skipTrash ${TMP} +-- +2.17.1 + + +From 236435b47010ea1ab94c3f018dcf278f5d2c44aa Mon Sep 17 00:00:00 2001 +From: antleb +Date: Fri, 12 Mar 2021 14:11:21 +0200 +Subject: [PATCH 6/8] following redirects + +--- + .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git 
a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+index c28be50d..29b225e3 100644
+--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
++++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh
+@@ -12,9 +12,9 @@ TARGET_DB=$2
+ TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6`
+ 
+ echo "Downloading context data"
+-curl ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
+-cat contexts.csv | cut -d , -f1 | xargs -I {} curl ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
+-cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
++curl -L ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
++cat contexts.csv | cut -d , -f1 | xargs -I {} curl -L ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
++cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl -L ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
+ cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv
+ cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
+ 
+-- 
+2.17.1
+
+
+From 60ebdf2dbe704733809f401df70bffcf49cede29 Mon Sep 17 00:00:00 2001
+From: antleb
+Date: Fri, 12 Mar 2021 16:34:53 +0200
+Subject: [PATCH 7/8] update promote wf to support monitor&production
+
+---
+ .../oa/graph/stats/oozie_app/impala-shell.sh  |  18 --
+ .../scripts/updateProductionViews.sql         | 207 ------------------
+ 2 files changed, 225 deletions(-)
+ delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
+ delete mode 100644 dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql
+
+diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
+deleted file mode 100644
+index 70112dc7..00000000
+--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh
++++ /dev/null
+@@ -1,18 +0,0 @@
+-export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
+-export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
+-if ! 
[ -L $link_folder ] +-then +- rm -Rf "$link_folder" +- ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +-fi +- +-echo "Getting file from " $3 +-hdfs dfs -copyToLocal $3 +- +-echo "Running impala shell make the new database visible" +-impala-shell -q "INVALIDATE METADATA;" +- +-echo "Running impala shell to compute new table stats" +-impala-shell -d $1 -f $2 +-echo "Impala shell finished" +-rm $2 +diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql +deleted file mode 100644 +index 48f8d58f..00000000 +--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql ++++ /dev/null +@@ -1,207 +0,0 @@ +------------------------------------------------------- +------------------------------------------------------- +--- Shadow schema table exchange +------------------------------------------------------- +------------------------------------------------------- +- +--- Dropping old views +-DROP VIEW IF EXISTS ${stats_db_production_name}.category; +-DROP VIEW IF EXISTS ${stats_db_production_name}.concept; +-DROP VIEW IF EXISTS ${stats_db_production_name}.context; +-DROP VIEW IF EXISTS ${stats_db_production_name}.country; +-DROP VIEW IF EXISTS ${stats_db_production_name}.countrygdp; +-DROP VIEW IF EXISTS ${stats_db_production_name}.creation_date; +-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset; +-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_citations; +-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_classifications; +-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_concepts; +-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_datasources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_languages; +-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_licenses; +-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_oids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_pids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_refereed; +-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_sources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_topics; +-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource; +-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_languages; +-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_oids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_organizations; +-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_results; +-DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_sources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.funder; +-DROP VIEW IF EXISTS ${stats_db_production_name}.fundref; +-DROP VIEW IF EXISTS ${stats_db_production_name}.numbers_country; +-DROP VIEW IF EXISTS ${stats_db_production_name}.organization; +-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_datasources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_pids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_projects; +-DROP VIEW IF EXISTS ${stats_db_production_name}.organization_sources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_citations; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_classifications; 
+-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_concepts; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_datasources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_languages; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_licenses; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_oids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_pids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_refereed; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_sources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_topics; +-DROP VIEW IF EXISTS ${stats_db_production_name}.project; +-DROP VIEW IF EXISTS ${stats_db_production_name}.project_oids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.project_organizations; +-DROP VIEW IF EXISTS ${stats_db_production_name}.project_results; +-DROP VIEW IF EXISTS ${stats_db_production_name}.project_resultcount; +-DROP VIEW IF EXISTS ${stats_db_production_name}.project_results_publication; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_citations; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_classifications; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_concepts; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_datasources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_languages; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_licenses; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_oids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_pids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_refereed; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_sources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.publication_topics; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_affiliated_country; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_citations; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_classifications; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_concepts; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_datasources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_deposited_country; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_fundercount; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_gold; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_greenoa; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_languages; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_licenses; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_oids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_organization; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_peerreviewed; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_pids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_projectcount; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_projects; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_refereed; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_sources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.result_topics; +-DROP VIEW IF EXISTS ${stats_db_production_name}.rndexpediture; +-DROP VIEW IF EXISTS ${stats_db_production_name}.roarmap; +-DROP VIEW 
IF EXISTS ${stats_db_production_name}.software; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_citations; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_classifications; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_concepts; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_datasources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_languages; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_licenses; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_oids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_pids; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_refereed; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_sources; +-DROP VIEW IF EXISTS ${stats_db_production_name}.software_topics; +- +- +--- Creating the shadow database, in case it doesn't exist +-CREATE database IF NOT EXISTS ${stats_db_production_name}; +- +--- Creating new views +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.category AS SELECT * FROM ${stats_db_name}.category; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.concept AS SELECT * FROM ${stats_db_name}.concept; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.context AS SELECT * FROM ${stats_db_name}.context; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.country AS SELECT * FROM ${stats_db_name}.country; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.creation_date AS SELECT * FROM ${stats_db_name}.creation_date; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_concepts AS SELECT * FROM ${stats_db_name}.dataset_concepts; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_datasources AS SELECT * FROM ${stats_db_name}.dataset_datasources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_languages AS SELECT * FROM ${stats_db_name}.dataset_languages; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_refereed AS SELECT * FROM ${stats_db_name}.dataset_refereed; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_languages AS SELECT * FROM ${stats_db_name}.datasource_languages; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids; +-CREATE VIEW IF NOT EXISTS 
${stats_db_production_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_sources AS SELECT * FROM ${stats_db_name}.datasource_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.funder AS SELECT * FROM ${stats_db_name}.funder; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization AS SELECT * FROM ${stats_db_name}.organization; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_pids AS SELECT * FROM ${stats_db_name}.organization_pids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_sources AS SELECT * FROM ${stats_db_name}.organization_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_concepts AS SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_datasources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_languages AS SELECT * FROM ${stats_db_name}.otherresearchproduct_languages; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_refereed AS SELECT * FROM ${stats_db_name}.otherresearchproduct_refereed; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_sources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project AS SELECT * FROM ${stats_db_name}.project; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations; +-CREATE VIEW IF NOT EXISTS 
${stats_db_production_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_resultcount AS SELECT * FROM ${stats_db_name}.project_resultcount; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results_publication AS SELECT * FROM ${stats_db_name}.project_results_publication; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication AS SELECT * FROM ${stats_db_name}.publication; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_languages AS SELECT * FROM ${stats_db_name}.publication_languages; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_refereed AS SELECT * FROM ${stats_db_name}.publication_refereed; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_topics AS SELECT * FROM ${stats_db_name}.publication_topics; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result AS SELECT * FROM ${stats_db_name}.result; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_affiliated_country AS SELECT * FROM ${stats_db_name}.result_affiliated_country; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_deposited_country AS SELECT * FROM ${stats_db_name}.result_deposited_country; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_fundercount AS SELECT * FROM ${stats_db_name}.result_fundercount; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_gold AS SELECT * FROM ${stats_db_name}.result_gold; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_greenoa AS SELECT * FROM ${stats_db_name}.result_greenoa; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_oids AS SELECT * FROM 
${stats_db_name}.result_oids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_peerreviewed AS SELECT * FROM ${stats_db_name}.result_peerreviewed; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projectcount AS SELECT * FROM ${stats_db_name}.result_projectcount; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_refereed AS SELECT * FROM ${stats_db_name}.result_refereed; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.roarmap AS SELECT * FROM ${stats_db_name}.roarmap; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software AS SELECT * FROM ${stats_db_name}.software; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_citations AS SELECT * FROM ${stats_db_name}.software_citations; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_classifications AS SELECT * FROM ${stats_db_name}.software_classifications; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_concepts AS SELECT * FROM ${stats_db_name}.software_concepts; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_datasources AS SELECT * FROM ${stats_db_name}.software_datasources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_languages AS SELECT * FROM ${stats_db_name}.software_languages; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_refereed AS SELECT * FROM ${stats_db_name}.software_refereed; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources; +-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics; +-- +2.17.1 + + +From 0ba0a6b9dac25f5ec73e8eafefbf7f91442ad1c5 Mon Sep 17 00:00:00 2001 +From: antleb +Date: Fri, 12 Mar 2021 16:42:59 +0200 +Subject: [PATCH 8/8] update promote wf to support monitor&production + +--- + .../stats/oozie_app/updateProductionViews.sh | 14 +++---- + .../dhp/oa/graph/stats/oozie_app/workflow.xml | 37 ++++++++++++------- + 2 files changed, 29 insertions(+), 22 deletions(-) + +diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh +index 57acb2ee..3e510e87 100644 +--- 
a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh ++++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh +@@ -7,12 +7,10 @@ then + fi + + export SOURCE=$1 +-export SHADOW=$2 ++export PRODUCTION=$2 + +-echo "Updating shadow database" +-impala-shell -d ${SOURCE} -q "invalidate metadata" +-impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/^\(.*\)/compute stats ${SOURCE}.\1;/" | impala-shell -c -f - +-impala-shell -q "create database if not exists ${SHADOW}" +-impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -c -f - +-impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - +-echo "Shadow db ready!" +\ No newline at end of file ++echo "Updating ${PRODUCTION} database" ++impala-shell -q "create database if not exists ${PRODUCTION}" ++impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f - ++impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - ++echo "Production db ready!" +\ No newline at end of file +diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +index d744f18d..0d8ff7ee 100644 +--- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml ++++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +@@ -6,7 +6,15 @@ + + + stats_db_production_name +- the name of the production schema ++ the name of the public production schema ++ ++ ++ monitor_db_name ++ the monitor database name ++ ++ ++ monitor_db_production_name ++ the name of the monitor public database + + + stats_tool_api_url +@@ -48,25 +56,26 @@ + + + +- +- ${hive_jdbc_url} +- +- stats_db_name=${stats_db_name} +- stats_db_production_name=${stats_db_production_name} +- +- ++ ++ ${jobTracker} ++ ${nameNode} ++ updateProductionViews.sh ++ ${stats_db_name} ++ ${stats_db_production_name} ++ updateProductionViews.sh ++ ++ + + + +- ++ + + ${jobTracker} + ${nameNode} +- impala-shell.sh +- ${stats_db_production_name} +- computeProductionStats.sql +- ${wf:appPath()}/scripts/computeProductionStats.sql +- impala-shell.sh ++ updateProductionViews.sh ++ ${monitor_db_name} ++ ${monitor_db_production_name} ++ updateProductionViews.sh + + + +-- +2.17.1 + diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh deleted file mode 100644 index 70112dc7b..000000000 --- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/impala-shell.sh +++ /dev/null @@ -1,18 +0,0 @@ -export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs -export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) -if ! 
[ -L $link_folder ] -then - rm -Rf "$link_folder" - ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} -fi - -echo "Getting file from " $3 -hdfs dfs -copyToLocal $3 - -echo "Running impala shell make the new database visible" -impala-shell -q "INVALIDATE METADATA;" - -echo "Running impala shell to compute new table stats" -impala-shell -d $1 -f $2 -echo "Impala shell finished" -rm $2 diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql deleted file mode 100644 index 34e48a18a..000000000 --- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/computeProductionStats.sql +++ /dev/null @@ -1,8 +0,0 @@ ------------------------------------------------------- ------------------------------------------------------- --- Impala table statistics - Needed to make the tables --- visible for impala ------------------------------------------------------- ------------------------------------------------------- - -INVALIDATE METADATA ${stats_db_name}; diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql deleted file mode 100644 index 48f8d58fd..000000000 --- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/updateProductionViews.sql +++ /dev/null @@ -1,207 +0,0 @@ ------------------------------------------------------- ------------------------------------------------------- --- Shadow schema table exchange ------------------------------------------------------- ------------------------------------------------------- - --- Dropping old views -DROP VIEW IF EXISTS ${stats_db_production_name}.category; -DROP VIEW IF EXISTS ${stats_db_production_name}.concept; -DROP VIEW IF EXISTS ${stats_db_production_name}.context; -DROP VIEW IF EXISTS ${stats_db_production_name}.country; -DROP VIEW IF EXISTS ${stats_db_production_name}.countrygdp; -DROP VIEW IF EXISTS ${stats_db_production_name}.creation_date; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_citations; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_classifications; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_concepts; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_datasources; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_languages; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_licenses; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_pids; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_refereed; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_sources; -DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_topics; -DROP VIEW IF EXISTS ${stats_db_production_name}.datasource; -DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_languages; -DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_organizations; -DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_results; -DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_sources; -DROP 
VIEW IF EXISTS ${stats_db_production_name}.funder; -DROP VIEW IF EXISTS ${stats_db_production_name}.fundref; -DROP VIEW IF EXISTS ${stats_db_production_name}.numbers_country; -DROP VIEW IF EXISTS ${stats_db_production_name}.organization; -DROP VIEW IF EXISTS ${stats_db_production_name}.organization_datasources; -DROP VIEW IF EXISTS ${stats_db_production_name}.organization_pids; -DROP VIEW IF EXISTS ${stats_db_production_name}.organization_projects; -DROP VIEW IF EXISTS ${stats_db_production_name}.organization_sources; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_citations; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_classifications; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_concepts; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_datasources; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_languages; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_licenses; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_pids; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_refereed; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_sources; -DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_topics; -DROP VIEW IF EXISTS ${stats_db_production_name}.project; -DROP VIEW IF EXISTS ${stats_db_production_name}.project_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.project_organizations; -DROP VIEW IF EXISTS ${stats_db_production_name}.project_results; -DROP VIEW IF EXISTS ${stats_db_production_name}.project_resultcount; -DROP VIEW IF EXISTS ${stats_db_production_name}.project_results_publication; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_citations; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_classifications; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_concepts; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_datasources; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_languages; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_licenses; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_pids; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_refereed; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_sources; -DROP VIEW IF EXISTS ${stats_db_production_name}.publication_topics; -DROP VIEW IF EXISTS ${stats_db_production_name}.result; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_affiliated_country; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_citations; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_classifications; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_concepts; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_datasources; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_deposited_country; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_fundercount; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_gold; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_greenoa; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_languages; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_licenses; -DROP VIEW IF 
EXISTS ${stats_db_production_name}.result_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_organization; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_peerreviewed; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_pids; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_projectcount; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_projects; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_refereed; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_sources; -DROP VIEW IF EXISTS ${stats_db_production_name}.result_topics; -DROP VIEW IF EXISTS ${stats_db_production_name}.rndexpediture; -DROP VIEW IF EXISTS ${stats_db_production_name}.roarmap; -DROP VIEW IF EXISTS ${stats_db_production_name}.software; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_citations; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_classifications; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_concepts; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_datasources; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_languages; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_licenses; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_oids; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_pids; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_refereed; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_sources; -DROP VIEW IF EXISTS ${stats_db_production_name}.software_topics; - - --- Creating the shadow database, in case it doesn't exist -CREATE database IF NOT EXISTS ${stats_db_production_name}; - --- Creating new views -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.category AS SELECT * FROM ${stats_db_name}.category; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.concept AS SELECT * FROM ${stats_db_name}.concept; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.context AS SELECT * FROM ${stats_db_name}.context; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.country AS SELECT * FROM ${stats_db_name}.country; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.creation_date AS SELECT * FROM ${stats_db_name}.creation_date; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_concepts AS SELECT * FROM ${stats_db_name}.dataset_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_datasources AS SELECT * FROM ${stats_db_name}.dataset_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_languages AS SELECT * FROM ${stats_db_name}.dataset_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_refereed AS SELECT * FROM ${stats_db_name}.dataset_refereed; -CREATE 
VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_languages AS SELECT * FROM ${stats_db_name}.datasource_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_sources AS SELECT * FROM ${stats_db_name}.datasource_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.funder AS SELECT * FROM ${stats_db_name}.funder; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization AS SELECT * FROM ${stats_db_name}.organization; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_pids AS SELECT * FROM ${stats_db_name}.organization_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_sources AS SELECT * FROM ${stats_db_name}.organization_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_concepts AS SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_datasources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_languages AS SELECT * FROM ${stats_db_name}.otherresearchproduct_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_refereed AS SELECT * FROM ${stats_db_name}.otherresearchproduct_refereed; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_sources AS 
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project AS SELECT * FROM ${stats_db_name}.project; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_resultcount AS SELECT * FROM ${stats_db_name}.project_resultcount; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results_publication AS SELECT * FROM ${stats_db_name}.project_results_publication; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication AS SELECT * FROM ${stats_db_name}.publication; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_languages AS SELECT * FROM ${stats_db_name}.publication_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_refereed AS SELECT * FROM ${stats_db_name}.publication_refereed; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_topics AS SELECT * FROM ${stats_db_name}.publication_topics; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result AS SELECT * FROM ${stats_db_name}.result; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_affiliated_country AS SELECT * FROM ${stats_db_name}.result_affiliated_country; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_deposited_country AS SELECT * FROM ${stats_db_name}.result_deposited_country; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_fundercount AS SELECT * FROM ${stats_db_name}.result_fundercount; 
-CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_gold AS SELECT * FROM ${stats_db_name}.result_gold; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_greenoa AS SELECT * FROM ${stats_db_name}.result_greenoa; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_oids AS SELECT * FROM ${stats_db_name}.result_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_peerreviewed AS SELECT * FROM ${stats_db_name}.result_peerreviewed; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projectcount AS SELECT * FROM ${stats_db_name}.result_projectcount; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_refereed AS SELECT * FROM ${stats_db_name}.result_refereed; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.roarmap AS SELECT * FROM ${stats_db_name}.roarmap; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software AS SELECT * FROM ${stats_db_name}.software; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_citations AS SELECT * FROM ${stats_db_name}.software_citations; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_classifications AS SELECT * FROM ${stats_db_name}.software_classifications; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_concepts AS SELECT * FROM ${stats_db_name}.software_concepts; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_datasources AS SELECT * FROM ${stats_db_name}.software_datasources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_languages AS SELECT * FROM ${stats_db_name}.software_languages; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_refereed AS SELECT * FROM ${stats_db_name}.software_refereed; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources; -CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics; diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh new 
file mode 100644 index 000000000..3e510e87e --- /dev/null +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateProductionViews.sh @@ -0,0 +1,16 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! [ -L $link_folder ] +then + rm -Rf "$link_folder" + ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} +fi + +export SOURCE=$1 +export PRODUCTION=$2 + +echo "Updating ${PRODUCTION} database" +impala-shell -q "create database if not exists ${PRODUCTION}" +impala-shell -d ${PRODUCTION} -q "show tables" --delimited | sed "s/^/drop view if exists ${PRODUCTION}./" | sed "s/$/;/" | impala-shell -c -f - +impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${PRODUCTION}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -c -f - +echo "Production db ready!" \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml index d744f18da..0d8ff7ee3 100644 --- a/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-stats-promote/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml @@ -6,7 +6,15 @@ stats_db_production_name - the name of the production schema + the name of the public production schema + + + monitor_db_name + the monitor database name + + + monitor_db_production_name + the name of the monitor public database stats_tool_api_url @@ -48,25 +56,26 @@ - - ${hive_jdbc_url} - - stats_db_name=${stats_db_name} - stats_db_production_name=${stats_db_production_name} - - - - - - ${jobTracker} ${nameNode} - impala-shell.sh + updateProductionViews.sh + ${stats_db_name} ${stats_db_production_name} - computeProductionStats.sql - ${wf:appPath()}/scripts/computeProductionStats.sql - impala-shell.sh + updateProductionViews.sh + + + + + + + + ${jobTracker} + ${nameNode} + updateProductionViews.sh + ${monitor_db_name} + ${monitor_db_production_name} + updateProductionViews.sh diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh new file mode 100644 index 000000000..29b225e3c --- /dev/null +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/contexts.sh @@ -0,0 +1,43 @@ +export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs +export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami) +if ! 
[ -L $link_folder ]
+then
+ rm -Rf "$link_folder"
+ ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
+fi
+
+CONTEXT_API=$1
+TARGET_DB=$2
+
+TMP=/tmp/stats-update-`tr -dc A-Za-z0-9 </dev/urandom | head -c 6`
+
+echo "Downloading context data"
+curl -L ${CONTEXT_API}/contexts?all=true -H "accept: application/json" | /usr/local/sbin/jq -r '.[] | "\(.id),\(.label)"' > contexts.csv
+cat contexts.csv | cut -d , -f1 | xargs -I {} curl -L ${CONTEXT_API}/context/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split(":")[0]),\(.id),\(.label)"' > categories.csv
+cat categories.csv | cut -d , -f2 | sed 's/:/%3A/g'| xargs -I {} curl -L ${CONTEXT_API}/context/category/{}/?all=true | /usr/local/sbin/jq -r '.[]|"\(.id|split("::")[0])::\(.id|split("::")[1]),\(.id),\(.label)"' > concepts.csv
+cat contexts.csv | sed 's/^\(.*\),\(.*\)/\1,\1::other,\2/' >> categories.csv
+cat categories.csv | grep -v ::other | sed 's/^.*,\(.*\),\(.*\)/\1,\1::other,\2/' >> concepts.csv
+
+echo "uploading context data to hdfs"
+hdfs dfs -mkdir ${TMP}
+hdfs dfs -copyFromLocal contexts.csv ${TMP}
+hdfs dfs -copyFromLocal categories.csv ${TMP}
+hdfs dfs -copyFromLocal concepts.csv ${TMP}
+hdfs dfs -chmod -R 777 ${TMP}
+
+echo "Creating and populating impala tables"
+impala-shell -q "create table ${TARGET_DB}.context (id string, name string) row format delimited fields terminated by ','"
+impala-shell -q "create table ${TARGET_DB}.category (context string, id string, name string) row format delimited fields terminated by ','"
+impala-shell -q "create table ${TARGET_DB}.concept (category string, id string, name string) row format delimited fields terminated by ','"
+impala-shell -d ${TARGET_DB} -q "invalidate metadata"
+impala-shell -q "load data inpath '${TMP}/contexts.csv' into table ${TARGET_DB}.context"
+impala-shell -q "load data inpath '${TMP}/categories.csv' into table ${TARGET_DB}.category"
+impala-shell -q "load data inpath '${TMP}/concepts.csv' into table ${TARGET_DB}.concept"
+
+echo "Cleaning up"
+hdfs dfs -rm -f -r -skipTrash ${TMP}
+rm concepts.csv
+rm categories.csv
+rm contexts.csv
+
+echo "Finito!"
\ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql index 6c96317e6..77fbd3b18 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step10.sql @@ -23,19 +23,6 @@ CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS SELECT * FROM ${external_stats_db_name}.rndexpediture; -CREATE OR REPLACE VIEW ${stats_db_name}.context AS -SELECT * -FROM ${external_stats_db_name}.context; - -CREATE OR REPLACE VIEW ${stats_db_name}.category AS -SELECT * -FROM ${external_stats_db_name}.category; - -CREATE OR REPLACE VIEW ${stats_db_name}.concept AS -SELECT * -FROM ${external_stats_db_name}.concept; - - ------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------ -- Creation date of the database diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql index 62a158560..75b24b189 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql @@ -47,7 +47,10 @@ from ${openaire_db_name}.publication p where p.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.publication_concepts AS -SELECT substr(p.id, 4) as id, contexts.context.id as concept +SELECT substr(p.id, 4) as id, case + when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id + when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') + when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept from ${openaire_db_name}.publication p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference = false; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql index dcd5ad858..540cc03a5 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step3.sql @@ -54,7 +54,10 @@ FROM ${openaire_db_name}.dataset p where p.datainfo.deletedbyinference = false; CREATE TABLE ${stats_db_name}.dataset_concepts AS -SELECT substr(p.id, 4) as id, contexts.context.id as concept +SELECT substr(p.id, 4) as id, case + when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id + when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other') + when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.context) contexts as context where p.datainfo.deletedbyinference = false; diff --git 
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
index fd5390e66..54345e074 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step4.sql
@@ -54,7 +54,10 @@ FROM ${openaire_db_name}.software p
 where p.datainfo.deletedbyinference = false;
 
 CREATE TABLE ${stats_db_name}.software_concepts AS
-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
+SELECT substr(p.id, 4) as id, case
+           when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
+           when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
+           when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
 FROM ${openaire_db_name}.software p LATERAL VIEW explode(p.context) contexts AS context
 where p.datainfo.deletedbyinference = false;
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
index b359b596f..36ad5d92a 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step5.sql
@@ -52,7 +52,10 @@ FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.
 where p.datainfo.deletedbyinference = false;
 
 CREATE TABLE ${stats_db_name}.otherresearchproduct_concepts AS
-SELECT substr(p.id, 4) AS id, contexts.context.id AS concept
+SELECT substr(p.id, 4) as id, case
+           when contexts.context.id RLIKE '^[^::]+::[^::]+::.+$' then contexts.context.id
+           when contexts.context.id RLIKE '^[^::]+::[^::]+$' then concat(contexts.context.id, '::other')
+           when contexts.context.id RLIKE '^[^::]+$' then concat(contexts.context.id, '::other::other') END as concept
 FROM ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.context) contexts AS context
 where p.datainfo.deletedbyinference = false;
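The workflow change below hands the context api base url and the target database to contexts.sh as the positional arguments $1 and $2. For reference, the script's first fetch can be sketched in Java; the endpoint path and the response shape (an array of objects carrying "id" and "label") are assumptions read off the jq filters above:

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;

    // Hypothetical stand-in for the first curl of contexts.sh: list the contexts
    // as JSON; the (id,label) extraction that jq performs is left to the caller.
    public class ContextApiProbe {
        public static void main(String[] args) throws Exception {
            String api = args[0]; // e.g. https://services.openaire.eu/openaire (assumed base url)
            HttpRequest request = HttpRequest
                .newBuilder(URI.create(api + "/contexts/?all=true")) // endpoint assumed from the script
                .header("accept", "application/json")
                .build();
            HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
            System.out.println(response.body());
        }
    }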
diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
index 9c16f149d..321500e2c 100644
--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/workflow.xml
@@ -41,6 +41,10 @@
         <property>
             <name>hive_timeout</name>
             <description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a heartbeat. The default value is 300 seconds.</description>
         </property>
+        <property>
+            <name>context_api_url</name>
+            <description>the base url of the context api (https://services.openaire.eu/openaire)</description>
+        </property>
     </parameters>
@@ -260,6 +264,19 @@
                 <param>stats_db_name=${stats_db_name}</param>
                 <param>openaire_db_name=${openaire_db_name}</param>
             </hive2>
+
+        <action name="Contexts">
+            <shell xmlns="uri:oozie:shell-action:0.1">
+                <job-tracker>${jobTracker}</job-tracker>
+                <name-node>${nameNode}</name-node>
+                <exec>contexts.sh</exec>
+                <argument>${context_api_url}</argument>
+                <argument>${stats_db_name}</argument>
+                <file>contexts.sh</file>
+            </shell>

From 3becaa5539aee14592873f90ebfd7275206f4f61 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Mon, 29 Mar 2021 16:01:35 +0200
Subject: [PATCH 86/86] [Cleaning] drop alternate identifiers with empty values

---
 .../main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java
index 3be062c0c..401d5d444 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java
@@ -155,12 +155,14 @@ public class CleaningFunctions {
 					final Set<StructuredProperty> pids = pid
 						.stream()
+						.filter(Objects::nonNull)
 						.filter(p -> StringUtils.isNotBlank(p.getValue()))
 						.collect(Collectors.toCollection(HashSet::new));
 
 					Optional.ofNullable(i.getAlternateIdentifier())
 						.ifPresent(altId -> {
 							final Set<StructuredProperty> altIds = altId.stream()
+								.filter(Objects::nonNull)
 								.filter(p -> StringUtils.isNotBlank(p.getValue()))
 								.collect(Collectors.toCollection(HashSet::new));
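The added filter(Objects::nonNull) matters because the next stage dereferences each element: with a null entry in the list, p.getValue() throws before StringUtils.isNotBlank can shield anything. A self-contained sketch of the pattern, where Pid is a hypothetical stand-in for the schema type:

    import java.util.*;
    import java.util.stream.*;

    // Stand-alone illustration of the null-then-blank filtering added above;
    // Pid mimics only the getValue() accessor the filters rely on.
    public class NullSafePids {
        static class Pid {
            final String value;
            Pid(String value) { this.value = value; }
            String getValue() { return value; }
        }

        public static void main(String[] args) {
            List<Pid> pid = Arrays.asList(new Pid("doi:10.1000/1"), null, new Pid("   "), new Pid(null));
            Set<Pid> pids = pid.stream()
                .filter(Objects::nonNull)                                            // drop null elements first...
                .filter(p -> p.getValue() != null && !p.getValue().trim().isEmpty()) // ...then blank values, as isNotBlank does
                .collect(Collectors.toCollection(HashSet::new));
            System.out.println(pids.size()); // 1: only the real pid survives
        }
    }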