From 326c9dc08ceac7613c187f3d3c3609731823f8bc Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 2 Aug 2023 18:05:53 +0200 Subject: [PATCH 01/20] Changes in maven poms to build and test the project using Spark 3.4.x and scala 2.12 --- dhp-common/pom.xml | 19 ++- dhp-pace-core/pom.xml | 12 -- dhp-workflows/dhp-dedup-openaire/pom.xml | 24 --- dhp-workflows/dhp-graph-provision/pom.xml | 14 +- .../oa/provision/utils/TemplateFactory.java | 3 - pom.xml | 145 ++++++++++++------ 6 files changed, 116 insertions(+), 101 deletions(-) diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 6198bd81e..d64e7e7a0 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -62,16 +62,17 @@ + + edu.cmu + secondstring + + eu.dnetlib.dhp dhp-pace-core ${project.version} - - org.apache.hadoop - hadoop-common - com.github.sisyphsu dateparser @@ -118,10 +119,6 @@ net.sf.saxon Saxon-HE - - org.slf4j - jcl-over-slf4j - org.apache.cxf cxf-rt-transports-http @@ -129,6 +126,12 @@ eu.dnetlib cnr-rmi-api + + + log4j + log4j + + diff --git a/dhp-pace-core/pom.xml b/dhp-pace-core/pom.xml index fd7f44fc9..a6d2538f2 100644 --- a/dhp-pace-core/pom.xml +++ b/dhp-pace-core/pom.xml @@ -53,14 +53,6 @@ edu.cmu secondstring - - com.google.guava - guava - - - com.google.code.gson - gson - org.apache.commons commons-lang3 @@ -85,10 +77,6 @@ com.fasterxml.jackson.core jackson-databind - - org.apache.commons - commons-math3 - com.jayway.jsonpath json-path diff --git a/dhp-workflows/dhp-dedup-openaire/pom.xml b/dhp-workflows/dhp-dedup-openaire/pom.xml index a271efe8e..2d40f44da 100644 --- a/dhp-workflows/dhp-dedup-openaire/pom.xml +++ b/dhp-workflows/dhp-dedup-openaire/pom.xml @@ -54,24 +54,10 @@ dhp-pace-core ${project.version} - org.apache.commons commons-lang3 - - - org.scala-lang.modules - scala-java8-compat_${scala.binary.version} - 1.0.2 - - - - org.scala-lang.modules - scala-collection-compat_${scala.binary.version} - 2.11.0 - - org.apache.spark spark-core_${scala.binary.version} @@ -80,16 +66,10 @@ org.apache.spark spark-sql_${scala.binary.version} - org.apache.spark spark-graphx_${scala.binary.version} - - - com.arakelian - java-jq - dom4j dom4j @@ -102,10 +82,6 @@ com.fasterxml.jackson.core jackson-databind - - com.fasterxml.jackson.core - jackson-core - org.apache.httpcomponents httpclient diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index e62fcdf19..47b056614 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -59,12 +59,6 @@ com.jayway.jsonpath json-path - - - org.slf4j - slf4j-api - - dom4j @@ -160,6 +154,14 @@ org.apache.zookeeper zookeeper + + ant + org.apache.ant + + + antlr4-runtime + org.antlr + diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java index 87c0261ac..7046b4cf0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java @@ -5,14 +5,11 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix; import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml; import java.io.IOException; -import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Optional; import 
java.util.stream.Collectors; -import javax.swing.text.html.Option; - import org.apache.commons.lang3.StringUtils; import org.stringtemplate.v4.ST; diff --git a/pom.xml b/pom.xml index 3fd351c1d..fa4f16df3 100644 --- a/pom.xml +++ b/pom.xml @@ -204,10 +204,17 @@ test + + org.slf4j + slf4j-api + ${org.slf4j.version} + provided + + org.slf4j jcl-over-slf4j - 1.7.25 + ${org.slf4j.version} provided @@ -217,22 +224,29 @@ ${dhp.commons.lang.version} + + org.apache.commons + commons-beanutils + ${commons-beanutils.version} + + + commons-validator commons-validator - 1.7 + ${commons-validator.version} com.github.sisyphsu dateparser - 1.0.7 + ${dateparser.version} me.xuender unidecode - 0.0.7 + ${unidecode.version} @@ -245,13 +259,13 @@ commons-codec commons-codec - 1.9 + ${commons-codec.version} commons-io commons-io - 2.4 + ${commons-io.version} @@ -415,6 +429,7 @@ cxf-rt-transports-http 3.1.5 + javax.persistence javax.persistence-api @@ -504,16 +519,11 @@ commons-compress ${common.compress.version} - - org.apache.commons commons-csv ${common.csv.version} - - - org.apache.poi poi-ooxml @@ -568,14 +578,12 @@ provided - org.apache.commons commons-math3 3.6.1 - com.google.code.gson gson @@ -596,7 +604,7 @@ org.reflections reflections - 0.9.10 + ${reflections.version} @@ -610,6 +618,12 @@ icu4j 70.1 + + + org.javassist + javassist + ${javassist.version} + @@ -866,46 +880,62 @@ sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop UTF-8 UTF-8 - 3.6.0 1.8 1.8 - 2.22.2 - 2.0.1 - cdh5.9.2 - 2.6.0-${dhp.cdh.version} - 4.1.0-${dhp.cdh.version} - dhp-schemas - 3.6.0 - 2.4.0.cloudera2 - 2.9.6 - 3.5 - true - 11.0.2 + + 2.11.12 2.11 - 1.3.0 - 5.6.1 - 3.3.3 - 3.4.2 - [2.12,3.0) - [4.17.2] + + + 3.6.0 + 2.22.2 + 2.0.1 + 4.0.1 + + + dhp-schemas + + 4.1.2 + [2.6.1] + 1.20 + 1.8 + 1.8 + 1.9.4 + 1.9 + 3.2.1 + 2.4 + 1.1.3 + 1.7 + 1.0.7 + [3.17.1] + cdh5.9.2 + 3.5 + 11.0.2 + 2.6.0-${dhp.cdh.version} + 2.9.6 + 4.1.0-${dhp.cdh.version} + true + 2.4.0.cloudera2 [4.0.3] [6.0.5] [3.1.6] - [2.6.1] - 7.5.0 - 4.7.2 - 1.20 + 2.2.2 + 3.19.0-GA 3.5.3 4.13.0 - 1.8 - 4.1.2 - 1.8 + 5.6.1 + 3.3.3 + 3.4.2 + 4.7.2 4.5.3 - 4.0.1 - 2.2.2 - 1.1.3 - 3.2.1 + 1.7.25 + 0.9.10 + 1.3.0 + 7.5.0 + 3.6.0 + 0.0.7 + [2.12,3.0) @@ -915,21 +945,40 @@ 2.12 2.12.18 - + 1.3.0 + + 4.8.1 + + 1.22 + 1.8 + 1.10.0 + 1.9.4 + 1.15 + 3.2.2 + 2.11.0 + 1.1.3 + 1.7 + + 14.0.1 + 8.11.0 4.0.2 3.4.1 2.14.2 3.12.0 3.7.0-M11 4.8.1 - + 3.25.0-GA + 4.10.0 + 2.0.6 + 0.10.2 + + - \ No newline at end of file + From 2fa78f6071206415b08b00c20a97c6ae8441a0fe Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Thu, 7 Sep 2023 11:58:59 +0200 Subject: [PATCH 02/20] Changes required to build and run tests with Java 17 --- .../WritePredefinedProjectPropertiesTest.java | 10 ++- .../java/eu/dnetlib/pace/util/UtilTest.java | 4 +- .../oa/dedup/graph/ConnectedComponent.java | 24 +++++--- .../doiboost/orcid/OrcidClientTest.java | 6 -- dhp-workflows/dhp-graph-provision/pom.xml | 61 ++++++++++++++++++- .../dhp/oa/provision/XmlIndexingJob.java | 10 +-- .../dnetlib/dhp/sparksolr/DHPSolrSupport.java | 12 ++++ .../dnetlib/dhp/sparksolr/DHPSolrSupport.java | 12 ++++ .../dhp-usage-raw-data-update/pom.xml | 12 +++- dhp-workflows/dhp-usage-stats-build/pom.xml | 18 +++++- pom.xml | 38 ++++++++++-- 11 files changed, 168 insertions(+), 39 deletions(-) create mode 100644 dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java create mode 100644 dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java diff
--git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java index 84b962b4b..19e9377af 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java @@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest { mojo.outputFile = testFolder; // execute - Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + try { + mojo.execute(); + Assertions.assertTrue(false); // not reached + } catch (Exception e) { + Assertions + .assertTrue( + MojoExecutionException.class.isAssignableFrom(e.getClass()) || + IllegalArgumentException.class.isAssignableFrom(e.getClass())); + } } @Test diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java index 6056c342d..c5c5eaba7 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java @@ -10,7 +10,6 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import eu.dnetlib.pace.model.Person; -import jdk.nashorn.internal.ir.annotations.Ignore; public class UtilTest { @@ -21,8 +20,7 @@ public class UtilTest { params = new HashMap<>(); } - @Test - @Ignore + // @Test public void paceResolverTest() { PaceResolver paceResolver = new PaceResolver(); paceResolver.getComparator("keywordMatch", params); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java index 4a39a175d..4fc0a25e8 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java @@ -3,6 +3,9 @@ package eu.dnetlib.dhp.oa.dedup.graph; import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -16,14 +19,16 @@ import eu.dnetlib.pace.util.PaceException; public class ConnectedComponent implements Serializable { - private String ccId; - private Set ids; + private String ccId = ""; + private List ids = Collections.EMPTY_LIST; private static final String CONNECTED_COMPONENT_ID_PREFIX = "connect_comp"; - public ConnectedComponent(Set ids, final int cut) { - this.ids = ids; + public ConnectedComponent() { + } + public ConnectedComponent(Set ids, final int cut) { + this.ids = new ArrayList<>(ids); this.ccId = createDefaultID(); if (cut > 0 && ids.size() > cut) { @@ -31,14 +36,15 @@ public class ConnectedComponent implements Serializable { .stream() .filter(id -> !ccId.equalsIgnoreCase(id)) .limit(cut - 1) - .collect(Collectors.toSet()); + .distinct() + .collect(Collectors.toList()); // this.ids.add(ccId); ?? 
} } public ConnectedComponent(String ccId, Set ids) { this.ccId = ccId; - this.ids = ids; + this.ids = new ArrayList<>(ids); } public String createDefaultID() { @@ -82,12 +88,12 @@ public class ConnectedComponent implements Serializable { } } - public Set getIds() { + public List getIds() { return ids; } - public void setIds(Set ids) { - this.ids = ids; + public void setIds(List ids) { + this.ids =ids; } public String getCcId() { diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java index 70bbd066a..8aebeda0b 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/OrcidClientTest.java @@ -30,7 +30,6 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.dhp.schema.orcid.AuthorData; import eu.dnetlib.doiboost.orcid.util.DownloadsReport; import eu.dnetlib.doiboost.orcid.util.MultiAttemptsHttpConnector; -import jdk.nashorn.internal.ir.annotations.Ignore; public class OrcidClientTest { final int REQ_LIMIT = 24; @@ -152,7 +151,6 @@ public class OrcidClientTest { } // @Test - @Ignore private void testModifiedDate() throws ParseException { testDate(toRetrieveDate); testDate(toNotRetrieveDate); @@ -332,7 +330,6 @@ public class OrcidClientTest { } @Test - @Ignore void testUpdatedRecord() throws Exception { final String base64CompressedRecord = IOUtils .toString(getClass().getResourceAsStream("0000-0001-7281-6306.compressed.base64")); @@ -341,7 +338,6 @@ public class OrcidClientTest { } @Test - @Ignore void testUpdatedWork() throws Exception { final String base64CompressedWork = "H4sIAAAAAAAAAM1XS2/jNhC+51cQOuxJsiXZSR03Vmq0G6Bo013E6R56oyXaZiOJWpKy4y783zvUg5Ksh5uiCJogisX5Zjj85sHx3f1rFKI94YKyeGE4I9tAJPZZQOPtwvj9+cGaGUhIHAc4ZDFZGEcijHvv6u7A+MtcPVCSSgsUQObYzuzaccBEguVuYYxt+LHgbwKP6a11M3WnY6UzrpB7KuiahlQeF0aSrkPqGwhcisWcxpLwGIcLYydlMh+PD4fDiHGfBvDcjmMxLhGlBglSH8vsIH0qGlLqBFRIGvvDWjWQ1iMJJ2CKBANqGlNqMbkj3IpxRPq1KkypFZFoDRHa0aRfq8JoNjhnfIAJJS6xPouiIQJyeYmGQzE+cO5cXqITcItBlKyASExD0a93jiwtvJDjYXDDAqBPHoH2wMmVWGNf8xyyaEBiSTeUDHHWBpd2Nmmc10yfbgHQrHCyIRxKjQwRUoFKPRwEnIgBnQJQVdGeQgJaCRN0OMnPkaUFVbD9WkpaIndQJowf+8EFoIpTErJjBFQOBavElFpfUxwC9ZcqvQErdQXhe+oPFF8BaObupYzVsYEOARzSoZBWmKqaBMHcV0Wf8oG0beIqD+Gdkz0lhyE3NajUW6fhQFSV9Nw/MCBYyofYa0EN7wrBz13eP+Y+J6obWgE8Pdd2JpYD94P77Ezmjj13b0bu5PqPu3EXumEnxEJaEVxSUIHammsra+53z44zt2/m1/bItaeVtQ6dhs3c4XytvW75IYUchMKvEHVUyqmnWBFAS0VJrqSvQde6vp251ux2NtFuKcVOi+oK9YY0M0Cn6o4J6WkvtEK2XJ1vfPGAZxSoK8lb+SxJBbLQx1CohOLndjJUywQWUFmqEi3G6Zaqf/7buOyYJd5IYpfmf0XipfP18pDR9cQCeEuJQI/Lx36bFbVnpBeL2UwmqQw7ApAvf4GeGGQdEbENgolui/wdpjHaYCmPCIPPAmGBIsxfoLUhyRCB0SeCakEBJRKBtfJ+UBbI15TG4PaGBAhWthx8DmFYtHZQujv1CWbLLdzmmUKmHEOWCe1/zdu78bn/+YH+hCOqOzcXfFwuP6OVT/P710crwqGXFrpNaM2GT3MXarw01i15TIi3pmtJXgtbTVGf3h6HKfF+wBAnPyTfdCChudlm5gZaoG//F9pPZsGQcqqbyZN5hBau5OoIJ3PPwjTKDuG4s5MZp2rMzF5PZoK34IT6PIFOPrk+mTiVO5aJH2C+JJRjE/06eoRfpJxa4VgyYaLlaJUv/EhCfATMU/76gEOfmehL/qbJNNHjaFna+CQYB8wvo9PpPFJ5MOrJ1Ix7USBZqBl7KRNOx1d3jex7SG6zuijqCMWRusBsncjZSrM2u82UJmqzpGhvUJN2t6caIM9QQgO9c0t40UROnWsJd2Rbs+nsxpna9u30ttNkjechmzHjEST+X5CkkuNY0GzQkzyFseAf7lSZuLwdh1xSXKvvQJ4g4abTYgPV7uMt3rskohlJmMa82kQkshtyBEIYqQ+YB8X3oRHg7iFKi/bZP+Ao+T6BJhIT/vNPi8ffZs+flk+r2v0WNroZiyWn6xRmadHqTJXsjLJczElAZX6TnJdoWTM1SI2gfutv3rjeBt5t06rVvNuWup29246tlvluO+u2/G92bK9DXheL6uFd/Q3EaRDZqBIAAA=="; final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork); 
@@ -413,7 +409,6 @@ public class OrcidClientTest { } @Test - @Ignore void testDownloadedAuthor() throws Exception { final String base64CompressedWork = "H4sIAAAAAAAAAI2Yy26jMBSG932KiD0hIe1MiwiVZjGLkWbX2XRHsFOsgs3YJmnefszFFy4+mUhtVPz9P/gcH/vQ9PWrrjYXzAVh9Bjst7tgg2nBEKEfx+DP28/wOdgImVOUV4ziY3DDInjNHlKOC8ZRMnxtmlyWxyDaqU+ofg7h/uX7IYwfn+Ngo25ARUKoxJzm1TEopWySKLper1vGC4LU74+IikgTWoFRW+SyfyyfxCBag4iQhBawyoGMDjdqJrnECJAZRquYLDEPaV5jv8oyWlXj+qTiXZLGr7KMiQbnjAOR6IY1W7C6hgIwjGt6SKGfHsY13ajHYipLIcIyJ5Xw6+akdvjEtyt4wxEwM6+VGph5N2zYr2ENhQRhKsmZYChmS1j7nFs6VIBPOwImKhyfMVeFg6GAWEjrcoQ4FoBmBGwVXYhagGHDBIEX+ZzUDiqyn35VN6rJUpUJ4zc/PAI2T03FbrUKJZQszWjV3zavVOjvVfoE01qB+YUUQPGNwHTt3luxJjdqh1AxJFBKLWOrSeCcF13RtxxYtlPOPqH6m+MLwVfoMQ2kdae2ArLajc6fTxkI1nIoegs0yB426pMO+0fSw07xDKMu0XKSde5C2VvrlVMijRzFwqY7XTJI1QMLWcmEzMxtDdxfHiYSgTNJnYJ1K9y5k0tUrMgrnGGaRiuXxxuClulYUbr0nBvpkYLjvgTCGsuSoex3f1CEvRPHKI184NJKtKeaiO7cD5E61bJ4F+9DFd7d01u8Tw6H5BBvvz8f3q3nXLGIeJULGdaqeVBBRK7rS7h/fNvvk/gpedxt4923dxP7Fc3KtKuc1BhlkrfYmeN4dcmrhmbw60+HmWw2CKgbTuqc32CXKTTmeTWT6bDBjPsQ0DTpnchdaYO0ayQ2FyLIiVREqs25aU8VKYLRbK0BsyZuqvr1MU2Sm/rDdhe/2CRN6FU/b+oBVyj1zqRtC5F8kAumfTclsl+s7EoNQu64nfOaVLeezX60Z3XCULLi6GI2IZGTEeey7fec9lBAuXawIHKcpifE7GABHWfoxLVfpUNPBXoMbZWrHFsR3bPAk9J9i2sw9nW6AQT1mpk++7JhW+v44Hmt8PomJqfD13jRnvFOSxCKtu6qHoyBbQ7cMFo750UEfGaXm6bEeplXIXj2hvL6mA7tzvIwmM9pbJFBG834POZdLGi2gH2u9u0K9HMwn5PTioFWLufzmrS4oNuU9Pkt2rf/2jMs7fMdm2rQTTM+j+49AzToAVuXYA1mD2k0+XdE9vAP+JYR5NcQAAA="; final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork); @@ -421,7 +416,6 @@ public class OrcidClientTest { } @Test - @Ignore void testDownloadedWork() throws Exception { final String base64CompressedWork = "H4sIAAAAAAAAANVa63LiOBb+z1Oo+LVbhbkGAlTCLE1Id9IhTQV6unr/CVvB2tiWR5Khmal5rX2BfbE9ki3b3Jzt6Y13h6pQSPrOXTo6knL10zffQxvCBWXBdbVVb1YRCWzm0GB9Xf28vLX6VSQkDhzssYBcV3dEVH8aVa62jL8M1RcKI2kBAYwNLnrtXrMPFCGW7nW10YSPBX8dq3XRb1swNGgomkaG3FBBV9SjcnddDaOVR+0qApUCMaSBJDzA3nXVlTIcNhrb7bbOuE0d+F43AtEwCENBnMjGUhtyjiSFGBqHCkkDu5gqB0rpSMgJsCJOAVmKMVRMuoRbAfbJeaoMY6h84q8gQi4Nz1NlmNQbnDNe4Ak1bLA28/0iB8TjBg1GMV5gdzxu0CGoxSBKlkMkpp44T3eINBxeyG5bKDABpJb7QF1guRpOsd/iOWRRhwSSPlNS5LNjsOHzHAXxmjlHmwBSr3DyTDgsNVLkkAxk6LDjcCIKaBJAtoo2FCagFTJBiyf5IdJwUAv2PJUaNUgXlgnju/PgBJDFKfTYzgdXFgXLYAzVLxH2wPWvrfQ9mKEVhG+oXbD4EsD+3H1txqaxgQwBPqRFIc0w2WoSBHNbLfqIF0zbfVymIbQ52VCyLVIzBRm6VeQVRFWNHuoHDASLeJH3jqDVUQXB5yrOH0ObE5UNLQe+R+1mu2U1u1Z7sGy2hq3esN2tt5oXf79qnELv8fGwkJYPmxSswD1uA6vVXrY7w+5g2G3WuxedjNsJmj2escJx33G/ZXsU5iAs/AyRR0WcjpRXBLglc0lM1BjP59bX1qw9Hn/+dH87/dy9vBikeinKkyzVHjoqJNWIk7QuE3KU6pES6O7MwsarJh44QW1KowcWOCxAC9tlzEPsGX3YrYGQICgS0JKzENach2bEoTYNyKEQzaJyQnzSqesKSaV3IhRx92L8tLAm7GerjbZUujSwlFnIobqKkTuth+Q4ED4Vqqypp5JyfK8ah5Ji0f8AZVSGT2TZVGXfBLw/liOyqdRpJqfyXr8ldyEZrehKkm8Jr/2hc3Qb7EVk9DfMJbU98pu3k+6aETXXBebCZpt23tBaBUfSZRxdo98eYmgNfRxrh3zAnldDM/37FvZ+IiWtoQfddgiaEGBIDGCG7btA7jgBP9svAK2h90l4yYqIGop5jgMHXA4J0NB9ksR+YTX0qFtfqACO01jGjDHFPx552AW2W0P3uvGROk4NLfTvCeNS8X9MaDg1rL9Qz6PYh7En3f4ZNmKS6nUfQYFmE6PYe05IYBqPFGaq5wHlYpaoDbYqxokVK+JBerz51z+BIzc+SfSdTHVrTiSYtZzGFNOdGrr5ohsLF2+NUguqppkDoua6/S6yXwAYu44pM+/HiZ1BwEDWMqYbC5fjZ+MEBwMjb4PRLdTFYWrUwiUhJH/H+G3pMl/7fjqJhTGwSwU5lnfLsVDmxIPvmRetbJeCOsvfaxWXbXWxLVziqNky51BLW1OP2JKzgNoASSa7Gk1WAfrLI9mirzBBIUD1r/W/AgrMla7CjEMOzYBJolo30/mnxd0SzadPt5+eZtMb9O7rEN1wNINgEA8Ha+IxNMdrHLCQRR4TFRCudnmB7m6GqD0YDCqW+lQqlfnndw93iw/TJ/RwN5k+TqZDNJkAQyUvUlWvktjrdgbQEeI1EapN8Grd7MOeYJlfajSxWVOMfcIhVQXgfcFsqhcceobVA/U3GjsbDCYrjVSKSz0wHo8Xym6dArRvvjsbAfUGouFr8s5lG9o72DVVSy1saDqMqlarWW+12r2GiIXXMzuAU6AQcLLqWf3mZRf6iOlsNQdda9BudhQnvNNdPWN8XA7BgU5G2k3pLADA75XD3BS
nn3y+3M90SbZWGczkxiRVmfSaJrd0V8u0yG3CeYRyht7O07Ste45weuqNmhcpLO44woEPRq1eilLN/f3ntEqGPFfzi2PmudHTO3EOEKf60LdTyUeDr7KIIzKfTfqtdr896JxklQtbES/IQD7UyL+SZIJSXYhLHkHZ9oqEjPR1MRzWu550cDYdCeI9n+S4hzouUU76+UeCQJ0fjkKn0+v3m703i0Eh/z97BCDH/XAAziTIt4rH94j7s4dHbSY/HJ90e3qriBQL+MMxCGETs9j/QxiSQ5PaS63/QsZqdS8vOxdvtj7Oc//fL4dTI2LvDAfVA6erSDKe3+cPxw70j4c5HHZlfLT9iAEZYKjZkxOYKZxymJy659l/t+QZllC5bvVJrzShD5GN0/NkiaZyqNcJh0NrdngtTfp7wviaHB+SS1Ng7O+Sk3h5HodT4S8RyY78pUmGM6eEg1l8tVCa1KnvY/SgrzDKsxRLF46j+uahNKH3BE6lsIb1lUxpUhdS3WUE+u6nPP/qiyAsklumMhMz9SBNqeus0oQ+QXqwIa7m3qy87IhXnBLPI8kVXXlZMaASm5vAEqWuKYkvHMtbPdiPiIdm6dVmeVMZjX+lfnKDWmaRAT7ev6ctTfhEF3RoWnJeXlKfSXcHcsf69rk0wTd4Qx30RV9yl5et2Ipwqe/SS5MJXiU8vbIv2b/qZaC8PZ65AUwj9QJR3vx1mQ9b7VPy1FFebnSpWq7xi0qJuwA+fLYpL7rwJdLXobcSa97kM4Cl35f3YXmofp0+8R9gBc/XeXL9Vn38pH7mLTs27z9T8ky1n7ynlZ0I4le78rYzl6t/woG5krwQlpcRcLDD2UPkH5F73C9G5tFKfY0q/wa1TIHI0CgAAA=="; final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork); diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index 47b056614..60c925227 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -18,7 +18,7 @@ scala-compile-first - initialize + process-resources add-source compile @@ -208,5 +208,64 @@ + + + scala-2.11 + + true + + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.4.0 + + + generate-sources + + add-source + + + + src/main/sparksolr-3 + + + + + + + + + + + scala-2.12 + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.4.0 + + + generate-sources + + add-source + + + + src/main/sparksolr-4 + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java index cd401c6cb..220eb4f53 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java @@ -27,12 +27,11 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.lucidworks.spark.util.SolrSupport; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument; import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient; import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory; +import eu.dnetlib.dhp.sparksolr.DHPSolrSupport; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; @@ -156,12 +155,7 @@ public class XmlIndexingJob { switch (outputFormat) { case SOLR: final String collection = ProvisionConstants.getCollectionName(format); - - // SparkSolr >= 4 - // com.lucidworks.spark.BatchSizeType bt = com.lucidworks.spark.BatchSizeType.NUM_DOCS; - // SolrSupport.indexDocs(zkHost, collection, batchSize, bt, docs.rdd()); - // SparkSolr < 4 - SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd()); + DHPSolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd()); break; case HDFS: spark diff --git a/dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java new file mode 100644 index 
000000000..295f0f54d --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-3/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java @@ -0,0 +1,12 @@ +package eu.dnetlib.dhp.sparksolr; + +import com.lucidworks.spark.util.SolrSupport; +import org.apache.solr.common.SolrInputDocument; +import org.apache.spark.rdd.RDD; + +public class DHPSolrSupport { + + static public void indexDocs(String zkhost, String collection, int batchSize, RDD docs) { + SolrSupport.indexDocs(zkhost, collection, batchSize, docs); + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java new file mode 100644 index 000000000..6b85176a3 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/sparksolr-4/eu/dnetlib/dhp/sparksolr/DHPSolrSupport.java @@ -0,0 +1,12 @@ +package eu.dnetlib.dhp.sparksolr; + +import com.lucidworks.spark.util.SolrSupport; +import org.apache.solr.common.SolrInputDocument; +import org.apache.spark.rdd.RDD; + +public class DHPSolrSupport { + + static public void indexDocs(String zkhost, String collection, int batchSize, RDD docs) { + SolrSupport.indexDocs(zkhost, collection, batchSize, com.lucidworks.spark.BatchSizeType.NUM_DOCS, docs); + } +} diff --git a/dhp-workflows/dhp-usage-raw-data-update/pom.xml b/dhp-workflows/dhp-usage-raw-data-update/pom.xml index a9dbb09ae..8ce9826e2 100644 --- a/dhp-workflows/dhp-usage-raw-data-update/pom.xml +++ b/dhp-workflows/dhp-usage-raw-data-update/pom.xml @@ -39,8 +39,8 @@ UTF-8 UTF-8 - 0.13.1-cdh5.2.1 - 2.5.0-cdh5.2.1 + 1.1.0-cdh5.16.2 + 2.6.0-cdh5.16.2 @@ -72,7 +72,13 @@ org.apache.hadoop hadoop-common ${cdh.hadoop.version} - + + + jdk.tools + jdk.tools + + + eu.dnetlib.dhp dhp-common diff --git a/dhp-workflows/dhp-usage-stats-build/pom.xml b/dhp-workflows/dhp-usage-stats-build/pom.xml index 56aec73b7..4dd987f51 100644 --- a/dhp-workflows/dhp-usage-stats-build/pom.xml +++ b/dhp-workflows/dhp-usage-stats-build/pom.xml @@ -39,8 +39,8 @@ UTF-8 UTF-8 - 0.13.1-cdh5.2.1 - 2.5.0-cdh5.2.1 + 1.1.0-cdh5.16.2 + 2.6.0-cdh5.16.2 @@ -67,11 +67,23 @@ org.apache.hive hive-jdbc ${cdh.hive.version} - + + + jdk.tools + jdk.tools + + + org.apache.hadoop hadoop-common ${cdh.hadoop.version} + + + jdk.tools + jdk.tools + + eu.dnetlib.dhp diff --git a/pom.xml b/pom.xml index fa4f16df3..78dda8513 100644 --- a/pom.xml +++ b/pom.xml @@ -120,11 +120,18 @@ conjars conjars - https://conjars.wensel.net/repo/ + https://conjars.wensel.net/repo/ + + + org.projectlombok + lombok + 1.18.28 + provided + org.junit.jupiter junit-jupiter @@ -812,7 +819,7 @@ org.jacoco jacoco-maven-plugin - 0.7.9 + 0.8.10 **/schemas/* @@ -963,8 +970,8 @@ 14.0.1 8.11.0 - 4.0.2 - 3.4.1 + 4.0.4 + 3.4.2-SNAPSHOT 2.14.2 3.12.0 3.7.0-M11 @@ -977,8 +984,29 @@ 3.17.2-SNAPSHOT --> + - + + java17 + + 17 + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M4 + + + --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED 
--add-opens=java.base/sun.util.calendar=ALL-UNNAMED + true + + + + + From 8c3e9a09d38fbb9d09d1a72d7bde2183c4a65967 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 18 Sep 2023 12:51:18 +0200 Subject: [PATCH 03/20] added repository openaire-third-parties --- .../WritePredefinedProjectPropertiesTest.java | 2 +- .../eu/dnetlib/pace/util/DiffPatchMatch.java | 18 + .../oa/dedup/graph/ConnectedComponent.java | 2 +- pom.xml | 2047 +++++++++-------- 4 files changed, 1058 insertions(+), 1011 deletions(-) diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java index 19e9377af..eddcd8867 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java @@ -88,7 +88,7 @@ class WritePredefinedProjectPropertiesTest { .assertTrue( MojoExecutionException.class.isAssignableFrom(e.getClass()) || IllegalArgumentException.class.isAssignableFrom(e.getClass())); - } + } } @Test diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java index cfd9acd70..154bac62c 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java @@ -1,6 +1,24 @@ package eu.dnetlib.pace.util; +/* + * Diff Match and Patch + * Copyright 2018 The diff-match-patch Authors. + * https://github.com/google/diff-match-patch + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + /* * Diff Match and Patch * Copyright 2018 The diff-match-patch Authors. 
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java index 4fc0a25e8..f4b3c441a 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java @@ -93,7 +93,7 @@ public class ConnectedComponent implements Serializable { } public void setIds(List ids) { - this.ids =ids; + this.ids = ids; } public String getCcId() { diff --git a/pom.xml b/pom.xml index 78dda8513..1480af2a6 100644 --- a/pom.xml +++ b/pom.xml @@ -1,1012 +1,1041 @@ - - - 4.0.0 - eu.dnetlib.dhp - dhp - 1.2.5-SNAPSHOT - pom - - - - GNU Affero General Public License v3.0 or later - https://spdx.org/licenses/AGPL-3.0-or-later.html#licenseText - repo - This program is free software: you can redistribute it and/or modify it under the terms of the - GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the - License, or (at your option) any later version. - - - - - dhp-build - dhp-pace-core - dhp-common - dhp-workflows - - - - Redmine - https://support.openaire.eu/projects/openaire - - - - jenkins - https://jenkins-dnet.d4science.org/ - - - - scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git - scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git - https://code-repo.d4science.org/D-Net/dnet-hadoop/ - HEAD - - - This module is the root descriptor for the dnet-hadoop project - - - - - - - dnet45-releases - D-Net 45 releases - https://maven.d4science.org/nexus/content/repositories/dnet45-releases - default - - false - - - true - - - - dnet45-snapshots - D-Net 45 snapshots - https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots - default - - true - - - false - - - - dnet45-bootstrap-snapshot - D-Net 45 Bootstrap Snapshot - https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-snapshot/ - - false - - - true - - default - - - dnet45-bootstrap-release - D-Net 45 Bootstrap Release - https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-release/ - - true - - - false - - default - - - cloudera - Cloudera Repository - https://repository.cloudera.com/artifactory/cloudera-repos - - true - - - false - - - - dnet-deps - dnet-dependencies - https://maven.d4science.org/nexus/content/repositories/dnet-deps - default - - - maven-restlet - Restlet repository - https://maven.restlet.talend.com - - - conjars - conjars - https://conjars.wensel.net/repo/ - - - - - - - org.projectlombok - lombok - 1.18.28 - provided - - - org.junit.jupiter - junit-jupiter - ${junit-jupiter.version} - test - - - - org.mockito - mockito-core - ${mockito-core.version} - test - - - - org.mockito - mockito-junit-jupiter - ${mockito-core.version} - test - - - - - - - - eu.dnetlib.dhp - ${dhp-schemas.artifact} - ${dhp-schemas.version} - - - org.apache.hadoop - hadoop-hdfs - ${dhp.hadoop.version} - provided - - - org.apache.hadoop - hadoop-common - ${dhp.hadoop.version} - provided - - - org.apache.hadoop - hadoop-client - ${dhp.hadoop.version} - provided - - - org.apache.hadoop - hadoop-distcp - ${dhp.hadoop.version} - provided - - - org.apache.spark - spark-core_${scala.binary.version} - ${dhp.spark.version} - provided - - - org.apache.spark - spark-sql_${scala.binary.version} - ${dhp.spark.version} - provided - - - org.apache.spark - 
spark-graphx_${scala.binary.version} - ${dhp.spark.version} - provided - - - org.apache.spark - spark-hive_${scala.binary.version} - ${dhp.spark.version} - test - - - - org.slf4j - slf4j-api - ${org.slf4j.version} - provided - - - - org.slf4j - jcl-over-slf4j - ${org.slf4j.version} - provided - - - - org.apache.commons - commons-lang3 - ${dhp.commons.lang.version} - - - - org.apache.commons - commons-beanutils - ${commons-beanutils.version} - - - - - commons-validator - commons-validator - ${commons-validator.version} - - - - com.github.sisyphsu - dateparser - ${dateparser.version} - - - - me.xuender - unidecode - ${unidecode.version} - - - - com.google.guava - guava - ${dhp.guava.version} - - - - - commons-codec - commons-codec - ${commons-codec.version} - - - - commons-io - commons-io - ${commons-io.version} - - - - commons-cli - commons-cli - 1.2 - provided - - - - net.sf.saxon - Saxon-HE - 9.9.1-6 - - - - dom4j - dom4j - 1.6.1 - - - - xml-apis - xml-apis - 1.4.01 - - - - jaxen - jaxen - 1.1.6 - - - - com.mycila.xmltool - xmltool - 3.3 - - - - org.apache.solr - solr-solrj - ${solr.version} - - - * - * - - - - - com.lucidworks.spark - spark-solr - ${sparksolr.version} - - - * - * - - - - - org.apache.solr - solr-test-framework - ${solr.version} - test - - - io.dropwizard.metrics - metrics-core - 3.2.6 - test - - - - - org.apache.httpcomponents - httpclient - ${org.apache.httpcomponents.version} - - - org.apache.httpcomponents - httpmime - ${org.apache.httpcomponents.version} - - - org.noggit - noggit - 0.8 - - - org.apache.zookeeper - zookeeper - 3.4.11 - - - - net.schmizz - sshj - 0.10.0 - test - - - - com.fasterxml.jackson.core - jackson-core - ${dhp.jackson.version} - provided - - - - com.fasterxml.jackson.core - jackson-annotations - ${dhp.jackson.version} - provided - - - com.fasterxml.jackson.core - jackson-databind - ${dhp.jackson.version} - provided - - - - eu.dnetlib - dnet-actionmanager-common - ${dnet-actionmanager-common.version} - - - org.apache.hadoop - hadoop-common - - - - - eu.dnetlib - dnet-actionmanager-api - ${dnet-actionmanager-api.version} - - - eu.dnetlib - cnr-misc-utils - - - - - - eu.dnetlib - cnr-rmi-api - ${cnr-rmi-api.version} - - - - eu.dnetlib.dhp - dnet-openaire-broker-common - ${dnet-openaire-broker-common.version} - - - - org.apache.cxf - cxf-rt-transports-http - 3.1.5 - - - - javax.persistence - javax.persistence-api - 2.2 - provided - - - - com.jayway.jsonpath - json-path - 2.4.0 - - - com.arakelian - java-jq - 0.10.1 - - - edu.cmu - secondstring - 1.0.0 - - - org.mongodb - mongo-java-driver - ${mongodb.driver.version} - - - io.fares.junit.mongodb - mongodb-junit-test - 1.1.0 - - - org.postgresql - postgresql - 42.2.10 - - - - org.antlr - stringtemplate - 3.2.1 - - - - org.antlr - ST4 - 4.3.4 - - - - com.ximpleware - vtd-xml - ${vtd.version} - - - - org.elasticsearch - elasticsearch-hadoop - 7.6.0 - - - - - org.apache.oozie - oozie-client - ${dhp.oozie.version} - provided - - - - slf4j-simple - org.slf4j - - - - - - - com.squareup.okhttp3 - okhttp - ${okhttp.version} - - - - org.apache.commons - commons-compress - ${common.compress.version} - - - org.apache.commons - commons-csv - ${common.csv.version} - - - org.apache.poi - poi-ooxml - ${apache.poi.version} - - - - org.json - json - 20180813 - - - - org.json4s - json4s-jackson_${scala.binary.version} - ${json4s.version} - - - - com.github.victools - jsonschema-generator - ${jsonschemagenerator.version} - - - - org.apache.commons - commons-text - ${common.text.version} - - - - com.opencsv - opencsv - 
5.5 - - - io.github.classgraph - classgraph - 4.8.71 - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-xml - ${jackson.version} - provided - - - com.fasterxml.jackson.module - jackson-module-jsonSchema - ${jackson.version} - provided - - - - org.apache.commons - commons-math3 - 3.6.1 - - - - com.google.code.gson - gson - ${google.gson.version} - - - - commons-collections - commons-collections - ${commons.collections.version} - - - commons-logging - commons-logging - ${commons.logging.version} - - - - org.reflections - reflections - ${reflections.version} - - - - org.scala-lang - scala-library - ${scala.version} - - - - com.ibm.icu - icu4j - 70.1 - - - - org.javassist - javassist - ${javassist.version} - - - - - - target - target/classes - ${project.artifactId}-${project.version} - target/test-classes - - - - org.apache.maven.plugins - maven-plugin-plugin - 3.3 - - - org.apache.maven.plugins - maven-project-info-reports-plugin - 3.0.0 - - - org.apache.maven.plugins - maven-site-plugin - 3.9.1 - - ${dhp.site.skip} - - - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven.compiler.plugin.version} - - 1.8 - 1.8 - ${project.build.sourceEncoding} - - - - - org.apache.maven.plugins - maven-jar-plugin - 3.0.2 - - - - org.apache.maven.plugins - maven-source-plugin - 3.0.1 - - - attach-sources - verify - - jar-no-fork - - - - - - - org.apache.maven.plugins - maven-surefire-plugin - 3.0.0-M4 - - true - - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.2.0 - - true - none - - - - org.apache.maven.plugins - maven-dependency-plugin - 3.6.0 - - - - net.revelc.code.formatter - formatter-maven-plugin - 2.11.0 - - - eu.dnetlib.dhp - dhp-code-style - ${project.version} - - - - - org.antipathy - mvn-scalafmt_${scala.binary.version} - 1.0.1640073709.733712b - - - eu.dnetlib.dhp - dhp-code-style - ${project.version} - - - - - - - - org.apache.maven.plugins - maven-site-plugin - - - org.apache.maven.plugins - maven-project-info-reports-plugin - - - net.revelc.code.formatter - formatter-maven-plugin - - - - format - - - eclipse/formatter_dnet.xml - - - - - - net.revelc.code - impsort-maven-plugin - 1.4.1 - - java.,javax.,org.,com. 
- java,* - - **/thrift/*.java - - - - - sort-imports - - sort - - - - - - org.antipathy - mvn-scalafmt_${scala.binary.version} - - https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf - false - false - - ${project.basedir}/src/main/scala - - - ${project.basedir}/src/test/scala - - false - false - : git rev-parse --abbrev-ref HEAD - false - - - - validate - - format - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.5.3 - - - org.jacoco - jacoco-maven-plugin - 0.8.10 - - - **/schemas/* - **/com/cloudera/**/* - **/org/apache/avro/io/**/* - - - - - default-prepare-agent - - prepare-agent - - - - default-report - prepare-package - - report - - - - - - - - - - org.apache.maven.wagon - wagon-ssh - 2.10 - - - - - - dnet45-snapshots - DNet45 Snapshots - https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots - default - - - dnet45-releases - https://maven.d4science.org/nexus/content/repositories/dnet45-releases - - - DHPSite - ${dhp.site.stage.path}/ - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - true - none - - - - - - - sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop - UTF-8 - UTF-8 - 1.8 - 1.8 - - - 2.11.12 - 2.11 - - - 3.6.0 - 2.22.2 - 2.0.1 - 4.0.1 - - - dhp-schemas - - 4.1.2 - [2.6.1] - 1.20 - 1.8 - 1.8 - 1.9.4 - 1.9 - 3.2.1 - 2.4 - 1.1.3 - 1.7 - 1.0.7 - [3.17.1] - cdh5.9.2 - 3.5 - 11.0.2 - 2.6.0-${dhp.cdh.version} - 2.9.6 - 4.1.0-${dhp.cdh.version} - true - 2.4.0.cloudera2 - [4.0.3] - [6.0.5] - [3.1.6] - 2.2.2 - 3.19.0-GA - 3.5.3 - 4.13.0 - 5.6.1 - 3.3.3 - 3.4.2 - 4.7.2 - 4.5.3 - 1.7.25 - 0.9.10 - 1.3.0 - 7.5.0 - 3.6.0 - 0.0.7 - [2.12,3.0) - - - - - - scala-2.12 - - 2.12 - 2.12.18 - 1.3.0 - - - 4.8.1 - - - 1.22 - 1.8 - 1.10.0 - 1.9.4 - 1.15 - 3.2.2 - 2.11.0 - 1.1.3 - 1.7 - - 14.0.1 - 8.11.0 + + + 4.0.0 + eu.dnetlib.dhp + dhp + 1.2.5-SNAPSHOT + pom + + + + GNU Affero General Public License v3.0 or later + https://spdx.org/licenses/AGPL-3.0-or-later.html#licenseText + repo + This program is free software: you can redistribute it and/or modify it under the terms of the + GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. 
+ + + + + + dhp-build + dhp-pace-core + dhp-common + dhp-workflows + + + + Redmine + https://support.openaire.eu/projects/openaire + + + + jenkins + https://jenkins-dnet.d4science.org/ + + + + scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git + scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git + https://code-repo.d4science.org/D-Net/dnet-hadoop/ + HEAD + + + This module is the root descriptor for the dnet-hadoop project + + + + + + + + Openaire-third-parties-snaphot + Openaire third parties Snapshot + https://maven.d4science.org/nexus/content/repositories/Openaire-third-parties-snaphot/ + + false + + + true + + + + + dnet45-releases + D-Net 45 releases + https://maven.d4science.org/nexus/content/repositories/dnet45-releases + default + + false + + + true + + + + dnet45-snapshots + D-Net 45 snapshots + https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots + default + + true + + + false + + + + dnet45-bootstrap-snapshot + D-Net 45 Bootstrap Snapshot + https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-snapshot/ + + false + + + true + + default + + + dnet45-bootstrap-release + D-Net 45 Bootstrap Release + https://maven.d4science.org/nexus/content/repositories/dnet45-bootstrap-release/ + + true + + + false + + default + + + cloudera + Cloudera Repository + https://repository.cloudera.com/artifactory/cloudera-repos + + true + + + false + + + + dnet-deps + dnet-dependencies + https://maven.d4science.org/nexus/content/repositories/dnet-deps + default + + + maven-restlet + Restlet repository + https://maven.restlet.talend.com + + + conjars + conjars + https://conjars.wensel.net/repo/ + + + + + + + org.projectlombok + lombok + 1.18.28 + provided + + + org.junit.jupiter + junit-jupiter + ${junit-jupiter.version} + test + + + + org.mockito + mockito-core + ${mockito-core.version} + test + + + + org.mockito + mockito-junit-jupiter + ${mockito-core.version} + test + + + + + + + + eu.dnetlib.dhp + ${dhp-schemas.artifact} + ${dhp-schemas.version} + + + org.apache.hadoop + hadoop-hdfs + ${dhp.hadoop.version} + provided + + + org.apache.hadoop + hadoop-common + ${dhp.hadoop.version} + provided + + + org.apache.hadoop + hadoop-client + ${dhp.hadoop.version} + provided + + + org.apache.hadoop + hadoop-distcp + ${dhp.hadoop.version} + provided + + + org.apache.spark + spark-core_${scala.binary.version} + ${dhp.spark.version} + provided + + + org.apache.spark + spark-sql_${scala.binary.version} + ${dhp.spark.version} + provided + + + org.apache.spark + spark-graphx_${scala.binary.version} + ${dhp.spark.version} + provided + + + org.apache.spark + spark-hive_${scala.binary.version} + ${dhp.spark.version} + test + + + + org.slf4j + slf4j-api + ${org.slf4j.version} + provided + + + + org.slf4j + jcl-over-slf4j + ${org.slf4j.version} + provided + + + + org.apache.commons + commons-lang3 + ${dhp.commons.lang.version} + + + + org.apache.commons + commons-beanutils + ${commons-beanutils.version} + + + + + commons-validator + commons-validator + ${commons-validator.version} + + + + com.github.sisyphsu + dateparser + ${dateparser.version} + + + + me.xuender + unidecode + ${unidecode.version} + + + + com.google.guava + guava + ${dhp.guava.version} + + + + + commons-codec + commons-codec + ${commons-codec.version} + + + + commons-io + commons-io + ${commons-io.version} + + + + commons-cli + commons-cli + 1.2 + provided + + + + net.sf.saxon + Saxon-HE + 9.9.1-6 + + + + dom4j + dom4j + 1.6.1 + + + + xml-apis + xml-apis + 1.4.01 + + + + jaxen + jaxen + 1.1.6 + + + + 
com.mycila.xmltool + xmltool + 3.3 + + + + org.apache.solr + solr-solrj + ${solr.version} + + + * + * + + + + + com.lucidworks.spark + spark-solr + ${sparksolr.version} + + + * + * + + + + + org.apache.solr + solr-test-framework + ${solr.version} + test + + + io.dropwizard.metrics + metrics-core + 3.2.6 + test + + + + + org.apache.httpcomponents + httpclient + ${org.apache.httpcomponents.version} + + + org.apache.httpcomponents + httpmime + ${org.apache.httpcomponents.version} + + + org.noggit + noggit + 0.8 + + + org.apache.zookeeper + zookeeper + 3.4.11 + + + + net.schmizz + sshj + 0.10.0 + test + + + + com.fasterxml.jackson.core + jackson-core + ${dhp.jackson.version} + provided + + + + com.fasterxml.jackson.core + jackson-annotations + ${dhp.jackson.version} + provided + + + com.fasterxml.jackson.core + jackson-databind + ${dhp.jackson.version} + provided + + + + eu.dnetlib + dnet-actionmanager-common + ${dnet-actionmanager-common.version} + + + org.apache.hadoop + hadoop-common + + + + + eu.dnetlib + dnet-actionmanager-api + ${dnet-actionmanager-api.version} + + + eu.dnetlib + cnr-misc-utils + + + + + + eu.dnetlib + cnr-rmi-api + ${cnr-rmi-api.version} + + + + eu.dnetlib.dhp + dnet-openaire-broker-common + ${dnet-openaire-broker-common.version} + + + + org.apache.cxf + cxf-rt-transports-http + 3.1.5 + + + + javax.persistence + javax.persistence-api + 2.2 + provided + + + + com.jayway.jsonpath + json-path + 2.4.0 + + + com.arakelian + java-jq + 0.10.1 + + + edu.cmu + secondstring + 1.0.0 + + + org.mongodb + mongo-java-driver + ${mongodb.driver.version} + + + io.fares.junit.mongodb + mongodb-junit-test + 1.1.0 + + + org.postgresql + postgresql + 42.2.10 + + + + org.antlr + stringtemplate + 3.2.1 + + + + org.antlr + ST4 + 4.3.4 + + + + com.ximpleware + vtd-xml + ${vtd.version} + + + + org.elasticsearch + elasticsearch-hadoop + 7.6.0 + + + + + org.apache.oozie + oozie-client + ${dhp.oozie.version} + provided + + + + slf4j-simple + org.slf4j + + + + + + + com.squareup.okhttp3 + okhttp + ${okhttp.version} + + + + org.apache.commons + commons-compress + ${common.compress.version} + + + org.apache.commons + commons-csv + ${common.csv.version} + + + org.apache.poi + poi-ooxml + ${apache.poi.version} + + + + org.json + json + 20180813 + + + + org.json4s + json4s-jackson_${scala.binary.version} + ${json4s.version} + + + + com.github.victools + jsonschema-generator + ${jsonschemagenerator.version} + + + + org.apache.commons + commons-text + ${common.text.version} + + + + com.opencsv + opencsv + 5.5 + + + io.github.classgraph + classgraph + 4.8.71 + + + + com.fasterxml.jackson.dataformat + jackson-dataformat-xml + ${jackson.version} + provided + + + com.fasterxml.jackson.module + jackson-module-jsonSchema + ${jackson.version} + provided + + + + org.apache.commons + commons-math3 + 3.6.1 + + + + com.google.code.gson + gson + ${google.gson.version} + + + + commons-collections + commons-collections + ${commons.collections.version} + + + commons-logging + commons-logging + ${commons.logging.version} + + + + org.reflections + reflections + ${reflections.version} + + + + org.scala-lang + scala-library + ${scala.version} + + + + com.ibm.icu + icu4j + 70.1 + + + + org.javassist + javassist + ${javassist.version} + + + + + + target + target/classes + ${project.artifactId}-${project.version} + target/test-classes + + + + org.apache.maven.plugins + maven-plugin-plugin + 3.3 + + + org.apache.maven.plugins + maven-project-info-reports-plugin + 3.0.0 + + + org.apache.maven.plugins + maven-site-plugin + 3.9.1 + 
+ ${dhp.site.skip} + + + + + org.apache.maven.plugins + maven-compiler-plugin + ${maven.compiler.plugin.version} + + 1.8 + 1.8 + ${project.build.sourceEncoding} + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.0.2 + + + + org.apache.maven.plugins + maven-source-plugin + 3.0.1 + + + attach-sources + verify + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M4 + + true + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.2.0 + + true + none + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.6.0 + + + + net.revelc.code.formatter + formatter-maven-plugin + 2.11.0 + + + eu.dnetlib.dhp + dhp-code-style + ${project.version} + + + + + org.antipathy + mvn-scalafmt_${scala.binary.version} + 1.0.1640073709.733712b + + + eu.dnetlib.dhp + dhp-code-style + ${project.version} + + + + + + + + org.apache.maven.plugins + maven-site-plugin + + + org.apache.maven.plugins + maven-project-info-reports-plugin + + + net.revelc.code.formatter + formatter-maven-plugin + + + + format + + + eclipse/formatter_dnet.xml + + + + + + net.revelc.code + impsort-maven-plugin + 1.4.1 + + java.,javax.,org.,com. + java,* + + **/thrift/*.java + + + + + sort-imports + + sort + + + + + + org.antipathy + mvn-scalafmt_${scala.binary.version} + + + https://code-repo.d4science.org/D-Net/dnet-hadoop/raw/branch/beta/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf + + false + false + + ${project.basedir}/src/main/scala + + + ${project.basedir}/src/test/scala + + false + false + : git rev-parse --abbrev-ref HEAD + false + + + + validate + + format + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.5.3 + + + org.jacoco + jacoco-maven-plugin + 0.8.10 + + + **/schemas/* + **/com/cloudera/**/* + **/org/apache/avro/io/**/* + + + + + default-prepare-agent + + prepare-agent + + + + default-report + prepare-package + + report + + + + + + + + + + org.apache.maven.wagon + wagon-ssh + 2.10 + + + + + + dnet45-snapshots + DNet45 Snapshots + https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots + default + + + dnet45-releases + https://maven.d4science.org/nexus/content/repositories/dnet45-releases + + + DHPSite + ${dhp.site.stage.path}/ + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + true + none + + + + + + + sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop + UTF-8 + UTF-8 + 1.8 + 1.8 + + + 2.11.12 + 2.11 + + + 3.6.0 + 2.22.2 + 2.0.1 + 4.0.1 + + + dhp-schemas + + 4.1.2 + [2.6.1] + 1.20 + 1.8 + 1.8 + 1.9.4 + 1.9 + 3.2.1 + 2.4 + 1.1.3 + 1.7 + 1.0.7 + [3.17.1] + cdh5.9.2 + 3.5 + 11.0.2 + 2.6.0-${dhp.cdh.version} + 2.9.6 + 4.1.0-${dhp.cdh.version} + true + 2.4.0.cloudera2 + [4.0.3] + [6.0.5] + [3.1.6] + 2.2.2 + 3.19.0-GA + 3.5.3 + 4.13.0 + 5.6.1 + 3.3.3 + 3.4.2 + 4.7.2 + 4.5.3 + 1.7.25 + 0.9.10 + 1.3.0 + 7.5.0 + 3.6.0 + 0.0.7 + [2.12,3.0) + + + + + + scala-2.12 + + 2.12 + 2.12.18 + 1.3.0 + + + 4.8.1 + + + 1.22 + 1.8 + 1.10.0 + 1.9.4 + 1.15 + 3.2.2 + 2.11.0 + 1.1.3 + 1.7 + + 14.0.1 + 8.11.0 4.0.4 - 3.4.2-SNAPSHOT - 2.14.2 - 3.12.0 - 3.7.0-M11 - 3.25.0-GA - 4.10.0 - 2.0.6 - 0.10.2 - - - + 3.4.2.openaire-SNAPSHOT + 2.14.2 + 3.12.0 + 3.7.0-M11 + 3.25.0-GA + 4.10.0 + 2.0.6 + 0.10.2 + + + - - java17 - - 17 - - - - - - org.apache.maven.plugins - maven-surefire-plugin - 3.0.0-M4 - - - --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED 
--add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED - true - - - - - - - + + java17 + + 17 + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M4 + + + --add-opens=java.base/java.lang=ALL-UNNAMED + --add-opens=java.base/java.lang.invoke=ALL-UNNAMED + --add-opens=java.base/java.lang.reflect=ALL-UNNAMED + --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED + --add-opens=java.base/java.nio=ALL-UNNAMED + --add-opens=java.base/java.util=ALL-UNNAMED + --add-opens=java.base/java.util.concurrent=ALL-UNNAMED + --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED + --add-opens=java.base/sun.nio.ch=ALL-UNNAMED + --add-opens=java.base/sun.nio.cs=ALL-UNNAMED + --add-opens=java.base/sun.security.action=ALL-UNNAMED + --add-opens=java.base/sun.util.calendar=ALL-UNNAMED + + true + + + + + + + From 52495f2cd2f7acc4b5e8ba0e6bc9b99e27a3ade4 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 18 Sep 2023 13:58:22 +0200 Subject: [PATCH 04/20] used javax.xml.stream.XMLEventReader instead of deprecated scala.xml.pull.XMLEventReader --- .../ebi/SparkCreateBaselineDataFrame.scala | 17 ++++++++-------- .../dnetlib/dhp/sx/bio/pubmed/PMParser.scala | 3 ++- .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 20 +++++++++---------- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala index 8ac8b00bf..6f5b7110f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.sx.bio.ebi import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup -import eu.dnetlib.dhp.schema.oaf.{Oaf, Result} +import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.sx.bio.pubmed._ import eu.dnetlib.dhp.utils.ISLookupClientFactory import org.apache.commons.io.IOUtils @@ -14,13 +14,13 @@ import org.apache.http.client.methods.HttpGet import org.apache.http.impl.client.HttpClientBuilder import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD -import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql._ +import org.apache.spark.sql.expressions.Aggregator import org.slf4j.{Logger, LoggerFactory} -import java.io.InputStream -import scala.io.Source -import scala.xml.pull.XMLEventReader +import java.io.{ByteArrayInputStream, InputStream} +import java.nio.charset.Charset +import javax.xml.stream.XMLInputFactory object SparkCreateBaselineDataFrame { @@ -83,7 +83,7 @@ object SparkCreateBaselineDataFrame { if (response.getStatusLine.getStatusCode > 400) { tries -= 1 } else - return IOUtils.toString(response.getEntity.getContent) + return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset()) } catch { case e: Throwable => println(s"Error on requesting ${r.getURI}") @@ -155,7 
+155,7 @@ object SparkCreateBaselineDataFrame { IOUtils.toString( SparkEBILinksToOaf.getClass.getResourceAsStream( "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json" - ) + ),Charset.defaultCharset() ) ) parser.parseArgument(args) @@ -194,10 +194,11 @@ object SparkCreateBaselineDataFrame { if (!"true".equalsIgnoreCase(skipUpdate)) { downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri) val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000) + val inputFactory = XMLInputFactory.newInstance val ds: Dataset[PMArticle] = spark.createDataset( k.filter(i => i._1.endsWith(".gz")) .flatMap(i => { - val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes())) + val xml =inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes())) new PMParser(xml) }) ) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala index 9102c12c4..fb941a461 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala @@ -1,7 +1,8 @@ package eu.dnetlib.dhp.sx.bio.pubmed import scala.xml.MetaData -import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} +import javax.xml.stream.XMLEventReader +import scala.xml.pull.{EvElemEnd, EvElemStart, EvText} /** @param xml */ diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index d1611300d..c4af14c40 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension import java.io.{BufferedReader, InputStream, InputStreamReader} import java.util.zip.GZIPInputStream +import javax.xml.stream.XMLInputFactory import scala.collection.JavaConverters._ import scala.collection.mutable.ListBuffer import scala.io.Source @@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest { @Test def testEBIData() = { - val inputXML = Source - .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) - .mkString - val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes())) + val inputFactory = XMLInputFactory.newInstance + val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s))) } @@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest { @Test def testParsingPubmedXML(): Unit = { - val xml = new XMLEventReader( - Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) - ) + val inputFactory = XMLInputFactory.newInstance + + val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) + val parser = new PMParser(xml) parser.foreach(checkPMArticle) } @@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest { @Test def testPubmedMapping(): Unit = { - val xml = new XMLEventReader( - Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) - ) + val inputFactory = XMLInputFactory.newInstance + val 
xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")) + val parser = new PMParser(xml) val results = ListBuffer[Oaf]() parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies)) From 613ec5ffceebb11740fc7ec29a406cbf7490ac14 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Thu, 21 Sep 2023 14:23:37 +0200 Subject: [PATCH 05/20] Add profiles for different spark versions: spark-24, spark-34, spark-35 --- .../eu/dnetlib/dhp/common/PacePerson.java | 2 +- dhp-pace-core/pom.xml | 88 ++++++++++++++++- .../eu/dnetlib/pace/model/SparkModel.scala | 9 +- .../eu/dnetlib/pace/util/DiffPatchMatch.java | 1 - .../dnetlib/pace/util/SparkCompatUtils.scala | 12 +++ .../dnetlib/pace/util/SparkCompatUtils.scala | 12 +++ .../ebi/SparkCreateBaselineDataFrame.scala | 5 +- .../createunresolvedentities/ProduceTest.java | 5 +- .../opencitations/ReadCOCITest.java | 4 +- dhp-workflows/dhp-graph-provision/pom.xml | 42 ++++++++- .../dnetlib/dhp/swh/PrepareSWHActionsets.java | 3 +- pom.xml | 94 ++++++++++++++++--- 12 files changed, 245 insertions(+), 32 deletions(-) create mode 100644 dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala create mode 100644 dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java index fac9a7565..fbf586f8c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java @@ -38,7 +38,7 @@ public class PacePerson { PacePerson.class .getResourceAsStream( "/eu/dnetlib/dhp/common/name_particles.txt"))); - } catch (IOException e) { + } catch (Exception e) { throw new RuntimeException(e); } } diff --git a/dhp-pace-core/pom.xml b/dhp-pace-core/pom.xml index a6d2538f2..6449b7ec8 100644 --- a/dhp-pace-core/pom.xml +++ b/dhp-pace-core/pom.xml @@ -24,7 +24,7 @@ scala-compile-first - initialize + process-resources add-source compile @@ -95,4 +95,90 @@ + + + spark-24 + + true + + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.4.0 + + + generate-sources + + add-source + + + + src/main/spark-2 + + + + + + + + + + + spark-34 + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.4.0 + + + generate-sources + + add-source + + + + src/main/spark-2 + + + + + + + + + + + spark-35 + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.4.0 + + + generate-sources + + add-source + + + + src/main/spark-35 + + + + + + + + + + diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala index aa997c6e9..63322738f 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala @@ -2,11 +2,10 @@ package eu.dnetlib.pace.model import com.jayway.jsonpath.{Configuration, JsonPath} import eu.dnetlib.pace.config.{DedupConfig, Type} -import eu.dnetlib.pace.util.MapDocumentUtil -import org.apache.spark.sql.catalyst.encoders.RowEncoder +import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils} +import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} -import org.apache.spark.sql.{Dataset, Row} import java.util.regex.Pattern import scala.collection.JavaConverters._ 
@@ -48,8 +47,8 @@ case class SparkModel(conf: DedupConfig) { val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName) - val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => { - df.map(r => rowFromJson(r))(RowEncoder(schema)) + val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => { + df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema)) } def rowFromJson(json: String): Row = { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java index 154bac62c..ac37c5e5a 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java @@ -18,7 +18,6 @@ package eu.dnetlib.pace.util; * See the License for the specific language governing permissions and * limitations under the License. */ - /* * Diff Match and Patch * Copyright 2018 The diff-match-patch Authors. diff --git a/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala b/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala new file mode 100644 index 000000000..a426703d6 --- /dev/null +++ b/dhp-pace-core/src/main/spark-2/eu/dnetlib/pace/util/SparkCompatUtils.scala @@ -0,0 +1,12 @@ +package eu.dnetlib.pace.util + +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} +import org.apache.spark.sql.types.StructType + +object SparkCompatUtils { + + def encoderFor(schema: StructType): ExpressionEncoder[Row] = { + RowEncoder(schema) + } +} \ No newline at end of file diff --git a/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala b/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala new file mode 100644 index 000000000..cbc454ae2 --- /dev/null +++ b/dhp-pace-core/src/main/spark-35/eu/dnetlib/pace/util/SparkCompatUtils.scala @@ -0,0 +1,12 @@ +package eu.dnetlib.pace.util + +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.types.StructType + +object SparkCompatUtils { + + def encoderFor(schema: StructType): ExpressionEncoder[Row] = { + ExpressionEncoder(schema) + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala index 6f5b7110f..11d087583 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala @@ -155,7 +155,8 @@ object SparkCreateBaselineDataFrame { IOUtils.toString( SparkEBILinksToOaf.getClass.getResourceAsStream( "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json" - ),Charset.defaultCharset() + ), + Charset.defaultCharset() ) ) parser.parseArgument(args) @@ -198,7 +199,7 @@ object SparkCreateBaselineDataFrame { val ds: Dataset[PMArticle] = spark.createDataset( k.filter(i => i._1.endsWith(".gz")) .flatMap(i => { - val xml =inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes())) + val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes())) new PMParser(xml) }) ) diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java 
b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java index ce116688a..0a4dfc00b 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java @@ -15,10 +15,7 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java index 3b416caf2..ebde0ed0c 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCITest.java @@ -119,7 +119,9 @@ public class ReadCOCITest { workingDir.toString() + "/COCI", "-outputPath", workingDir.toString() + "/COCI_json/", - "-inputFile", "input1;input2;input3;input4;input5" + "-inputFile", "input1;input2;input3;input4;input5", + "-format", + "COCI" }); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml index 60c925227..4b4e6c1c4 100644 --- a/dhp-workflows/dhp-graph-provision/pom.xml +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -162,6 +162,18 @@ antlr4-runtime org.antlr + + woodstox-core + com.fasterxml.woodstox + + + log4j + * + + + org.apache.logging.log4j + * + @@ -210,7 +222,7 @@ - scala-2.11 + spark-24 true @@ -240,7 +252,7 @@ - scala-2.12 + spark-34 @@ -266,6 +278,32 @@ + + spark-35 + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.4.0 + + + generate-sources + + add-source + + + + src/main/sparksolr-4 + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java index 2691d4b7e..230a077f7 100644 --- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java +++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java @@ -17,6 +17,7 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; import org.apache.spark.sql.Dataset; @@ -117,7 +118,7 @@ public class PrepareSWHActionsets { .map( (MapFunction) t -> OBJECT_MAPPER.readValue(t, Software.class), Encoders.bean(Software.class)) - .filter(t -> t.getCodeRepositoryUrl() != null) + .filter((FilterFunction) t -> t.getCodeRepositoryUrl() != null) .select(col("id"), col("codeRepositoryUrl.value").as("repoUrl")); } diff --git a/pom.xml b/pom.xml index 1480af2a6..8c6bcd3d1 100644 --- a/pom.xml +++ b/pom.xml @@ -174,7 +174,7 @@ 
eu.dnetlib.dhp - ${dhp-schemas.artifact} + dhp-schemas ${dhp-schemas.version} @@ -233,6 +233,13 @@ provided + + org.slf4j + slf4j-log4j12 + ${org.slf4j.version} + provided + + org.slf4j jcl-over-slf4j @@ -240,6 +247,28 @@ provided + + org.apache.logging.log4j + log4j-slf4j2-impl + ${log4j.version} + + + org.apache.logging.log4j + log4j-api + ${log4j.version} + + + org.apache.logging.log4j + log4j-core + ${log4j.version} + + + + org.apache.logging.log4j + log4j-1.2-api + ${log4j.version} + + org.apache.commons commons-lang3 @@ -381,7 +410,7 @@ org.apache.zookeeper zookeeper - 3.4.11 + ${zookeeper.version} @@ -713,6 +742,7 @@ 3.0.0-M4 true + false @@ -782,7 +812,7 @@ net.revelc.code impsort-maven-plugin - 1.4.1 + 1.6.2 java.,javax.,org.,com. java,* @@ -918,8 +948,6 @@ 4.0.1 - dhp-schemas - 4.1.2 [2.6.1] 1.20 @@ -932,7 +960,7 @@ 1.1.3 1.7 1.0.7 - [3.17.1] + 4.17.2 cdh5.9.2 3.5 11.0.2 @@ -945,6 +973,7 @@ [6.0.5] [3.1.6] 2.2.2 + 1.2.17 3.19.0-GA 3.5.3 4.13.0 @@ -960,12 +989,13 @@ 3.6.0 0.0.7 [2.12,3.0) + 3.4.6 - scala-2.12 + spark-34 2.12 2.12.18 @@ -988,25 +1018,60 @@ 14.0.1 8.11.0 4.0.4 - 3.4.2.openaire-SNAPSHOT + 3.4.2.openaire 2.14.2 3.12.0 + 2.19.0 3.7.0-M11 3.25.0-GA 4.10.0 2.0.6 0.10.2 - + 3.6.3 - java17 + spark-35 + + 2.12 + 2.12.18 + 1.3.0 + + + 4.8.1 + + + 1.23.0 + 1.8 + 1.10.0 + 1.9.4 + 1.16.0 + 3.2.2 + 2.13.0 + 1.1.3 + 1.7 + + 14.0.1 + 8.11.0 + 4.0.4 + 3.5.1.openaire-SNAPSHOT + 2.15.2 + 3.12.0 + 2.20.0 + 3.7.0-M11 + 3.25.0-GA + 4.10.0 + 2.0.7 + 0.10.2 + 3.6.3 + + + + + java11 - 17 + [11 @@ -1031,6 +1096,7 @@ --add-opens=java.base/sun.util.calendar=ALL-UNNAMED true + false From 342cb6189bbbfe44dfae772fc5308f419a6f8d09 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 19 Apr 2024 12:13:26 +0200 Subject: [PATCH 06/20] fixed problem on changed signature on RowEncoder removed property dhp.schema.artifact --- dhp-common/pom.xml | 2 +- .../java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index d64e7e7a0..7c99ed527 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -164,7 +164,7 @@ eu.dnetlib.dhp - ${dhp-schemas.artifact} + dhp-schemas diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index cb1c70059..bade4869f 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.dedup; import static org.apache.spark.sql.functions.col; +import eu.dnetlib.pace.util.SparkCompatUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -147,7 +148,7 @@ public class SparkPropagateRelation extends AbstractSparkAction { StructType idsSchema = StructType .fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>"); - Dataset allIds = spark.emptyDataset(RowEncoder.apply(idsSchema)); + Dataset allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema)); for (EntityType entityType : ModelSupport.entityTypes.keySet()) { String entityPath = graphBasePath + '/' + entityType.name(); From 8dd9cf84e2ccbeec1db1d91193e813b35555bfb1 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 19 Apr 2024 12:30:59 
+0200 Subject: [PATCH 07/20] code formatted --- .../java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index bade4869f..c5cb299b1 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -3,7 +3,6 @@ package eu.dnetlib.dhp.oa.dedup; import static org.apache.spark.sql.functions.col; -import eu.dnetlib.pace.util.SparkCompatUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.util.SparkCompatUtils; import scala.Tuple2; import scala.Tuple3; From 073f320c6a2735bda5d51e7bd7766f01f791651d Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 22 Apr 2024 11:32:31 +0200 Subject: [PATCH 08/20] Added a module containing all the dependencies, useful for Spark deployment on k8s. --- .../eu/dnetlib/pace/model/SparkModel.scala | 4 +- dhp-shade-package/pom.xml | 169 ++++++++++++++++++ .../dhp/oa/dedup/SparkCreateMergeRels.java | 4 +- .../dhp/oa/dedup/SparkPropagateRelation.java | 1 - pom.xml | 1 + 5 files changed, 174 insertions(+), 5 deletions(-) create mode 100644 dhp-shade-package/pom.xml diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala index aa04188da..e6a1c4ccc 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala @@ -3,7 +3,7 @@ package eu.dnetlib.pace.model import com.jayway.jsonpath.{Configuration, JsonPath} import eu.dnetlib.pace.common.AbstractPaceFunctions import eu.dnetlib.pace.config.{DedupConfig, Type} -import eu.dnetlib.pace.util.MapDocumentUtil +import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils} import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema @@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) { val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName) val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => { - df.map(r => rowFromJson(r))(RowEncoder(schema)) + df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema)) } def rowFromJson(json: String): Row = { diff --git a/dhp-shade-package/pom.xml b/dhp-shade-package/pom.xml new file mode 100644 index 000000000..128a57116 --- /dev/null +++ b/dhp-shade-package/pom.xml @@ -0,0 +1,169 @@ + + + 4.0.0 + + eu.dnetlib.dhp + dhp + 1.2.5-SNAPSHOT + ../pom.xml + + + + dhp-shade-package + jar + + + + DHPSite + ${dhp.site.stage.path}/dhp-common + + + + This module creates a jar of all module dependencies + + + + + + eu.dnetlib.dhp + dhp-actionmanager + ${project.version} + + + eu.dnetlib.dhp + dhp-aggregation + ${project.version} + + + eu.dnetlib.dhp + dhp-blacklist + ${project.version} + + + eu.dnetlib.dhp + dhp-broker-events + ${project.version} + +
eu.dnetlib.dhp + dhp-dedup-openaire + ${project.version} + + + eu.dnetlib.dhp + dhp-enrichment + ${project.version} + + + eu.dnetlib.dhp + dhp-graph-mapper + ${project.version} + + + eu.dnetlib.dhp + dhp-graph-provision + ${project.version} + + + eu.dnetlib.dhp + dhp-impact-indicators + ${project.version} + + + eu.dnetlib.dhp + dhp-stats-actionsets + ${project.version} + + + eu.dnetlib.dhp + dhp-stats-hist-snaps + ${project.version} + + + eu.dnetlib.dhp + dhp-stats-monitor-irish + ${project.version} + + + eu.dnetlib.dhp + dhp-stats-promote + ${project.version} + + + eu.dnetlib.dhp + dhp-stats-update + ${project.version} + + + eu.dnetlib.dhp + dhp-swh + ${project.version} + + + eu.dnetlib.dhp + dhp-usage-raw-data-update + ${project.version} + + + eu.dnetlib.dhp + dhp-usage-stats-build + ${project.version} + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + package + + shade + + + + + eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels + + + + + META-INF/cxf/bus-extensions.txt + + + + + *:* + + META-INF/maven/** + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + com + repackaged.com.google.common + + com.google.common.** + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java index 59626c141..d48351c48 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java @@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.SparkCompatUtils; import scala.Tuple3; import scala.collection.JavaConversions; @@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction { Dataset pivotHistory = spark .createDataset( Collections.emptyList(), - RowEncoder - .apply(StructType.fromDDL("id STRING, lastUsage STRING"))); + SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING"))); if (StringUtils.isNotBlank(pivotHistoryDatabase)) { pivotHistory = spark diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index c64fbe4a4..c7efce4d7 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -8,7 +8,6 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.ReduceFunction; import org.apache.spark.sql.*; -import org.apache.spark.sql.catalyst.encoders.RowEncoder; import org.apache.spark.sql.types.StructType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/pom.xml b/pom.xml index 06e4ba9d4..fc68a666d 100644 --- a/pom.xml +++ b/pom.xml @@ -23,6 +23,7 @@ dhp-pace-core dhp-common dhp-workflows + dhp-shade-package From 9cd3bc0f10cc8104cd1dcde539f577ea1a3f3df9 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 26 Apr 2024 16:02:07 +0200 Subject: [PATCH 09/20] Added a new generation of the dump 
for Scholexplorer, tested with the latest version of Spark, and strongly refactored --- .../scholexplorer/relation/relations.json | 8 + .../dhp/sx/graph/scholix/ScholixUtils.scala | 19 +- .../dhp/sx/create_scholix_dump_params.json | 5 + .../eu/dnetlib/dhp/sx/relation/relations.json | 166 ++++++++++++ .../dhp/sx/graph/ScholexplorerUtils.scala | 256 ++++++++++++++++++ .../graph/SparkCreateScholexplorerDump.scala | 130 +++++++++ .../graph/scholix/ScholixGenerationTest.scala | 17 ++ pom.xml | 2 +- 8 files changed, 597 insertions(+), 6 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/relation/relations.json create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala diff --git a/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json b/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json index 98e8daa18..4f0cee53d 100644 --- a/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json +++ b/dhp-common/src/main/resources/eu/dnetlib/scholexplorer/relation/relations.json @@ -154,5 +154,13 @@ "unknown":{ "original":"Unknown", "inverse":"Unknown" + }, + "isamongtopnsimilardocuments": { + "original": "IsAmongTopNSimilarDocuments", + "inverse": "HasAmongTopNSimilarDocuments" + }, + "hasamongtopnsimilardocuments": { + "original": "HasAmongTopNSimilarDocuments", + "inverse": "IsAmongTopNSimilarDocuments" + } } \ No newline at end of file diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala index a995016a8..f256ca1a1 100644 --- a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala +++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala @@ -65,7 +65,11 @@ object ScholixUtils extends Serializable { } def generateScholixResourceFromResult(r: Result): ScholixResource = { - generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r)) + val sum = ScholixUtils.resultToSummary(r) + if (sum != null) + generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r)) + else + null } val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = @@ -153,6 +157,14 @@ } + def invRel(rel: String): String = { + val semanticRelation = relations.getOrElse(rel.toLowerCase, null) + if (semanticRelation != null) + semanticRelation.inverse + else + null + } + def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = { if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) { val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d => @@ -377,10 +389,7 @@ if (persistentIdentifiers.isEmpty) return null s.setLocalIdentifier(persistentIdentifiers.asJava) - if (r.isInstanceOf[Publication]) - s.setTypology(Typology.publication) - else - s.setTypology(Typology.dataset) + s.setTypology(r.getResulttype.getClassid)
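// Usage sketch for the invRel helper added above: the lookup is case-insensitive
// against the relations.json vocabulary and returns the inverse label, or null for
// an unknown relation. The wrapper object is an illustrative assumption, not code
// from the repository; the relation names come from the vocabulary shown in this patch.
object InvRelSketch {
  import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils

  def main(args: Array[String]): Unit = {
    println(ScholixUtils.invRel("IsSupplementedBy")) // prints IsSupplementTo
    println(ScholixUtils.invRel("cites"))            // prints IsCitedBy
    println(ScholixUtils.invRel("not-a-relation"))   // prints null
  }
}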
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json new file mode 100644 index 000000000..fead58ab1 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json @@ -0,0 +1,5 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true}, + {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/relation/relations.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/relation/relations.json new file mode 100644 index 000000000..4f0cee53d --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/relation/relations.json @@ -0,0 +1,166 @@ +{ + "cites":{ + "original":"Cites", + "inverse":"IsCitedBy" + }, + "compiles":{ + "original":"Compiles", + "inverse":"IsCompiledBy" + }, + "continues":{ + "original":"Continues", + "inverse":"IsContinuedBy" + }, + "derives":{ + "original":"IsSourceOf", + "inverse":"IsDerivedFrom" + }, + "describes":{ + "original":"Describes", + "inverse":"IsDescribedBy" + }, + "documents":{ + "original":"Documents", + "inverse":"IsDocumentedBy" + }, + "hasmetadata":{ + "original":"HasMetadata", + "inverse":"IsMetadataOf" + }, + "hasassociationwith":{ + "original":"HasAssociationWith", + "inverse":"HasAssociationWith" + }, + "haspart":{ + "original":"HasPart", + "inverse":"IsPartOf" + }, + "hasversion":{ + "original":"HasVersion", + "inverse":"IsVersionOf" + }, + "iscitedby":{ + "original":"IsCitedBy", + "inverse":"Cites" + }, + "iscompiledby":{ + "original":"IsCompiledBy", + "inverse":"Compiles" + }, + "iscontinuedby":{ + "original":"IsContinuedBy", + "inverse":"Continues" + }, + "isderivedfrom":{ + "original":"IsDerivedFrom", + "inverse":"IsSourceOf" + }, + "isdescribedby":{ + "original":"IsDescribedBy", + "inverse":"Describes" + }, + "isdocumentedby":{ + "original":"IsDocumentedBy", + "inverse":"Documents" + }, + "isidenticalto":{ + "original":"IsIdenticalTo", + "inverse":"IsIdenticalTo" + }, + "ismetadatafor":{ + "original":"IsMetadataFor", + "inverse":"IsMetadataOf" + }, + "ismetadataof":{ + "original":"IsMetadataOf", + "inverse":"IsMetadataFor" + }, + "isnewversionof":{ + "original":"IsNewVersionOf", + "inverse":"IsPreviousVersionOf" + }, + "isobsoletedby":{ + "original":"IsObsoletedBy", + "inverse":"Obsoletes" + }, + "isoriginalformof":{ + "original":"IsOriginalFormOf", + "inverse":"IsVariantFormOf" + }, + "ispartof":{ + "original":"IsPartOf", + "inverse":"HasPart" + }, + "ispreviousversionof":{ + "original":"IsPreviousVersionOf", + "inverse":"IsNewVersionOf" + }, + "isreferencedby":{ + "original":"IsReferencedBy", + "inverse":"References" + }, + "isrelatedto":{ + "original":"IsRelatedTo", + "inverse":"IsRelatedTo" + }, + "isrequiredby":{ + "original":"IsRequiredBy", + "inverse":"Requires" + }, + "isreviewedby":{ + "original":"IsReviewedBy", + "inverse":"Reviews" + }, + "issourceof":{ + "original":"IsSourceOf", + "inverse":"IsDerivedFrom" + }, + "issupplementedby":{ + 
"original":"IsSupplementedBy", + "inverse":"IsSupplementTo" + }, + "issupplementto":{ + "original":"IsSupplementTo", + "inverse":"IsSupplementedBy" + }, + "isvariantformof":{ + "original":"IsVariantFormOf", + "inverse":"IsOriginalFormOf" + }, + "isversionof":{ + "original":"IsVersionOf", + "inverse":"HasVersion" + }, + "obsoletes":{ + "original":"Obsoletes", + "inverse":"IsObsoletedBy" + }, + "references":{ + "original":"References", + "inverse":"IsReferencedBy" + }, + "requires":{ + "original":"Requires", + "inverse":"IsRequiredBy" + }, + "related":{ + "original":"IsRelatedTo", + "inverse":"IsRelatedTo" + }, + "reviews":{ + "original":"Reviews", + "inverse":"IsReviewedBy" + }, + "unknown":{ + "original":"Unknown", + "inverse":"Unknown" + }, + "isamongtopnsimilardocuments": { + "original": "IsAmongTopNSimilarDocuments", + "inverse": "HasAmongTopNSimilarDocuments" + }, + "hasamongtopnsimilardocuments": { + "original": "HasAmongTopNSimilarDocuments", + "inverse": "IsAmongTopNSimilarDocuments" + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala new file mode 100644 index 000000000..95564d523 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala @@ -0,0 +1,256 @@ +package eu.dnetlib.dhp.sx.graph + +import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty} +import eu.dnetlib.dhp.schema.sx.scholix.{ + Scholix, + ScholixCollectedFrom, + ScholixEntityId, + ScholixIdentifier, + ScholixRelationship, + ScholixResource +} +import org.json4s +import org.json4s.DefaultFormats +import org.json4s.jackson.JsonMethods.parse + +import scala.collection.JavaConverters._ +import scala.io.Source + +case class RelationInfo( + source: String, + target: String, + relclass: String, + id: String, + collectedfrom: Seq[RelKeyValue] +) {} +case class RelKeyValue(key: String, value: String) {} + +object ScholexplorerUtils { + + val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier" + + case class RelationVocabulary(original: String, inverse: String) {} + + val relations: Map[String, RelationVocabulary] = { + val input = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/relation/relations.json") + ) + .mkString + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + + lazy val json: json4s.JValue = parse(input) + + json.extract[Map[String, RelationVocabulary]] + } + + def invRel(rel: String): String = { + val semanticRelation = relations.getOrElse(rel.toLowerCase, null) + if (semanticRelation != null) + semanticRelation.inverse + else + null + } + + def generateDatasourceOpenAIREURLS(id: String): String = { + if (id != null && id.length > 12) + s"https://explore.openaire.eu/search/dataprovider?datasourceId=${id.substring(3)}" + else + null + } + + def findURLForPID( + pidValue: List[StructuredProperty], + urls: List[String] + ): List[(StructuredProperty, String)] = { + pidValue.map { p => + val pv = p.getValue + + val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase)) + (p, r.orNull) + } + } + + def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = { + if (r.getInstance() == null || r.getInstance().isEmpty) + return List() + r.getInstance() + .asScala + .filter(i => i.getUrl != null && !i.getUrl.isEmpty) + .filter(i => i.getPid != null && i.getUrl != null) + .flatMap(i => 
findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList)) + .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)) + .distinct + .toList + } + + def generateScholixResourceFromResult(result: Result): ScholixResource = { + + if (result.getInstance() == null || result.getInstance().size() == 0) + return null + + if (result.getPid == null || result.getPid.isEmpty) + return null + + val r = new ScholixResource + r.setDnetIdentifier(result.getId) + + val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(result) + if (persistentIdentifiers.isEmpty) + return null + + r.setIdentifier(persistentIdentifiers.asJava) + + r.setObjectType(result.getResulttype.getClassid) + + r.setObjectSubType( + result + .getInstance() + .asScala + .filter(i => i != null && i.getInstancetype != null) + .map(i => i.getInstancetype.getClassname) + .distinct + .head + ) + + if (result.getTitle != null && result.getTitle.asScala.nonEmpty) { + val titles: List[String] = result.getTitle.asScala.map(t => t.getValue).toList + if (titles.nonEmpty) + r.setTitle(titles.head) + else + return null + } + if (result.getAuthor != null && !result.getAuthor.isEmpty) { + val authors: List[ScholixEntityId] = + result.getAuthor.asScala + .map(a => { + val entity = new ScholixEntityId() + entity.setName(a.getFullname) + if (a.getPid != null && a.getPid.size() > 0) + entity.setIdentifiers( + a.getPid.asScala + .map(sp => { + val id = new ScholixIdentifier() + id.setIdentifier(sp.getValue) + id.setSchema(sp.getQualifier.getClassid) + id + }) + .take(3) + .toList + .asJava + ) + entity + }) + .toList + if (authors.nonEmpty) + r.setCreator(authors.asJava) + + } + + val dt: List[String] = result + .getInstance() + .asScala + .filter(i => i.getDateofacceptance != null) + .map(i => i.getDateofacceptance.getValue) + .toList + if (dt.nonEmpty) + r.setPublicationDate(dt.distinct.head) + + r.setPublisher( + result + .getInstance() + .asScala + .map(i => i.getHostedby) + .filter(h => !"unknown".equalsIgnoreCase(h.getValue)) + .map(h => { + val eid = new ScholixEntityId() + eid.setName(h.getValue) + val id = new ScholixIdentifier() + id.setIdentifier(h.getKey) + id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA) + id.setUrl(generateDatasourceOpenAIREURLS(h.getKey)) + eid.setIdentifiers(List(id).asJava) + eid + }) + .distinct + .asJava + ) + + r.setCollectedFrom( + result.getCollectedfrom.asScala + .map(cf => { + val scf = new ScholixCollectedFrom() + scf.setProvisionMode("collected") + scf.setCompletionStatus("complete") + val eid = new ScholixEntityId() + eid.setName(cf.getValue) + val id = new ScholixIdentifier() + id.setIdentifier(cf.getKey) + id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA) + id.setUrl(generateDatasourceOpenAIREURLS(cf.getKey)) + eid.setIdentifiers(List(id).asJava) + scf.setProvider(eid) + scf + }) + .asJava + ) + + r + } + + def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = { + val s: Scholix = new Scholix + s.setSource(source) + if (relation.collectedfrom != null && relation.collectedfrom.nonEmpty) + s.setLinkprovider( + relation.collectedfrom + .map(cf => { + val eid = new ScholixEntityId() + eid.setName(cf.value) + val id = new ScholixIdentifier() + id.setIdentifier(cf.key) + id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA) + id.setUrl(generateDatasourceOpenAIREURLS(cf.key)) + eid.setIdentifiers(List(id).asJava) + eid + }) + .toList + .asJava + ) + else { + val eid = new ScholixEntityId() + eid.setName("OpenAIRE") + val id = new ScholixIdentifier() + 
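// Fallback branch: when the relation carries no collectedfrom entries, OpenAIRE
// itself becomes the link provider, identified just below by the dnet id of the
// OpenAIRE infrastructure datasource and its explore-portal URL.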
id.setIdentifier("10|infrastruct_::f66f1bd369679b5b077dcdf006089556") + id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA) + id.setUrl(generateDatasourceOpenAIREURLS(id.getIdentifier)) + eid.setIdentifiers(List(id).asJava) + s.setLinkprovider(List(eid).asJava) + } + s.setIdentifier(relation.id) + val semanticRelation = relations.getOrElse(relation.relclass.toLowerCase, null) + if (semanticRelation == null) + return null + s.setRelationship( + new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse) + ) + s.setPublicationDate(source.getPublicationDate) + s.setPublisher(source.getPublisher) + val mockTarget = new ScholixResource + mockTarget.setDnetIdentifier(relation.target) + s.setTarget(mockTarget) + s + } + + def updateTarget(s: Scholix, t: ScholixResource): Scholix = { + + s.setTarget(t) + val spublishers: Seq[ScholixEntityId] = + if (s.getPublisher != null && !s.getPublisher.isEmpty) s.getPublisher.asScala else List() + val tpublishers: Seq[ScholixEntityId] = + if (t.getPublisher != null && !t.getPublisher.isEmpty) t.getPublisher.asScala else List() + val mergedPublishers = spublishers.union(tpublishers).distinct.take(10).toList + s.setPublisher(mergedPublishers.asJava) + s + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala new file mode 100644 index 000000000..9334fc6e0 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala @@ -0,0 +1,130 @@ +package eu.dnetlib.dhp.sx.graph + +import eu.dnetlib.dhp.application.AbstractScalaApplication +import eu.dnetlib.dhp.schema.oaf.{ + KeyValue, + OtherResearchProduct, + Publication, + Relation, + Result, + Software, + Dataset => OafDataset +} +import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource} +import org.apache.spark.sql.functions.{col, concat, expr, md5} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql._ +import org.slf4j.{Logger, LoggerFactory} + +class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger) + extends AbstractScalaApplication(propertyPath, args, log: Logger) { + + /** Here all the spark applications runs this method + * where the whole logic of the spark node is defined + */ + override def run(): Unit = { + val sourcePath = parser.get("sourcePath") + log.info("sourcePath: {}", sourcePath) + val targetPath = parser.get("targetPath") + log.info("targetPath: {}", targetPath) + generateBidirectionalRelations(sourcePath, targetPath, spark) + generateScholixResource(sourcePath, targetPath, spark) + generateScholix(targetPath, spark) + } + + def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = { + val entityMap: Map[String, StructType] = Map( + "publication" -> Encoders.bean(classOf[Publication]).schema, + "dataset" -> Encoders.bean(classOf[OafDataset]).schema, + "software" -> Encoders.bean(classOf[Software]).schema, + "otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema + ) + + implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource]) + implicit val resultEncoder: Encoder[Result] = Encoders.bean(classOf[Result]) + + val resDs = spark.emptyDataset[ScholixResource] + val scholixResourceDS = entityMap.foldLeft[Dataset[ScholixResource]](resDs)((res, item) => { + println(s"adding 
${item._1}") + res.union( + spark.read + .schema(item._2) + .json(s"$inputPath/${item._1}") + .as[Result] + .map(r => ScholexplorerUtils.generateScholixResourceFromResult(r)) + .filter(s => s != null) + ) + }) + scholixResourceDS.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource") + } + + def generateBidirectionalRelations(inputPath: String, otuputPath: String, spark: SparkSession): Unit = { + val relSchema = Encoders.bean(classOf[Relation]).schema + + val relDF = spark.read + .schema(relSchema) + .json(s"$inputPath/relation") + .where( + "datainfo.deletedbyinference is false and source like '50%' and target like '50%' " + + "and relClass <> 'merges' and relClass <> 'isMergedIn'" + ) + .select("source", "target", "collectedfrom", "relClass") + + def invRel: String => String = { s => + ScholexplorerUtils.invRel(s) + } + + import org.apache.spark.sql.functions.udf + val inverseRelationUDF = udf(invRel) + val inverseRelation = relDF.select( + col("target").alias("source"), + col("source").alias("target"), + col("collectedfrom"), + inverseRelationUDF(col("relClass")).alias("relClass") + ) + + val bidRel = inverseRelation + .union(relDF) + .withColumn("id", md5(concat(col("source"), col("relClass"), col("target")))) + .withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))")) + .drop("collectedfrom") + .withColumnRenamed("cf", "collectedfrom") + .distinct() + + bidRel.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation") + + } + + def generateScholix(outputPath: String, spark: SparkSession): Unit = { + implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource]) + implicit val scholixEncoder: Encoder[Scholix] = Encoders.bean(classOf[Scholix]) + + import spark.implicits._ + val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo] + val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource] + + val scholix_one_verse = relations + .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner") + .map(res => ScholexplorerUtils.generateScholix(res._1, res._2)) + + scholix_one_verse + .joinWith(resource, scholix_one_verse("target.dnetIdentifier") === resource("dnetIdentifier"), "inner") + .map(k => ScholexplorerUtils.updateTarget(k._1, k._2)) + .write + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(s"$outputPath/scholix") + } +} + +object SparkCreateScholexplorerDump { + val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass) + + def main(args: Array[String]): Unit = { + new SparkCreateScholexplorerDump( + log = logger, + args = args, + propertyPath = "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json" + ).initialize().run() + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala new file mode 100644 index 000000000..0a2872cb4 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala @@ -0,0 +1,17 @@ +package eu.dnetlib.dhp.sx.graph.scholix + +import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump +import org.apache.spark.sql.SparkSession +import org.junit.jupiter.api.Test + +class ScholixGenerationTest { + + @Test + def generateScholix(): Unit = { + val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate() + val app = new 
SparkCreateScholexplorerDump(null, null, null) +// app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark) +// app.generateBidirectionalRelations("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark) + app.generateScholix("/home/sandro/Downloads/scholix/", spark) + } +} diff --git a/pom.xml b/pom.xml index d3db1d3d4..9f6f1f2a9 100644 --- a/pom.xml +++ b/pom.xml @@ -960,7 +960,7 @@ 1.1.3 1.7 1.0.7 - [6.1.1] + [6.1.2-SNAPSHOT] cdh5.9.2 3.5 11.0.2 From 052c6aac9d2dd96d37d75120890aa4dc4647a19b Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 26 Apr 2024 16:03:04 +0200 Subject: [PATCH 10/20] formatted code --- .../dhp/collection/crossref/Crossref2Oaf.scala | 1 + .../dnetlib/dhp/collection/crossref/issn_pub.json | 4 ---- .../collection/crossref/CrossrefMappingTest.scala | 13 ++++++++++++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala index 44c82e256..c4aa64fd4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala @@ -1025,6 +1025,7 @@ case object Crossref2Oaf { tp._1 match { case "electronic" => journal.setIssnOnline(tp._2) case "print" => journal.setIssnPrinted(tp._2) + case _ => } }) } diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json index 2a9e391df..2f1af2a6e 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/issn_pub.json @@ -789,10 +789,6 @@ "value": "2227-9717", "type": "electronic" }, - { - "value": "VALUE", - "type": "PIPPO" - }, { "value": "1063-4584", "type": "pu" diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala index ed43bb1a1..c3ea884eb 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala @@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest -import org.junit.jupiter.api.BeforeEach +import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType +import org.apache.commons.io.IOUtils +import org.junit.jupiter.api.{BeforeEach, Test} import org.junit.jupiter.api.extension.ExtendWith import org.mockito.junit.jupiter.MockitoExtension import org.slf4j.{Logger, LoggerFactory} @@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest { super.setUpVocabulary() } + @Test + def mappingRecord(): Unit = { + val input = + IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8") + + println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All)) + + } + } From 
133ead1e3ef86be422783eddf9fd3e46738b6e02 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 29 Apr 2024 09:00:30 +0200 Subject: [PATCH 11/20] Updated the new version of the Scholexplorer generation --- .../dhp/sx/graph/SparkCreateScholexplorerDump.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala index 9334fc6e0..1211dcc78 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala @@ -107,9 +107,13 @@ class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], lo .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner") .map(res => ScholexplorerUtils.generateScholix(res._1, res._2)) + val resourceTarget = relations + .joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner") + .map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource]))) + scholix_one_verse - .joinWith(resource, scholix_one_verse("target.dnetIdentifier") === resource("dnetIdentifier"), "inner") - .map(k => ScholexplorerUtils.updateTarget(k._1, k._2)) + .joinWith(resourceTarget, scholix_one_verse("identifier") === resourceTarget("_1"), "inner") + .map(k => ScholexplorerUtils.updateTarget(k._1, k._2._2)) .write .mode(SaveMode.Overwrite) .option("compression", "gzip") From 0646d0d0645341020ee12c284e0872e6e450cc11 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 2 May 2024 15:15:03 +0200 Subject: [PATCH 12/20] Updated the main SparkApplication to avoid requiring the master variable --- .../eu/dnetlib/dhp/application/SparkScalaApplication.scala | 7 ++++--- .../eu/dnetlib/dhp/sx/create_scholix_dump_params.json | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala index a14c25837..526bbd295 100644 --- a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala +++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala @@ -65,12 +65,13 @@ abstract class AbstractScalaApplication( val conf: SparkConf = new SparkConf() val master = parser.get("master") log.info(s"Creating Spark session: Master: $master") - SparkSession + val b = SparkSession .builder() .config(conf) .appName(getClass.getSimpleName) - .master(master) - .getOrCreate() + if (master != null) + b.master(master) + b.getOrCreate() } def reportTotalSize(targetPath: String, outputBasePath: String): Unit = { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json index fead58ab1..53fe95895 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/create_scholix_dump_params.json @@ -1,5 +1,5 @@ [ - {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"mt",
"paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": false}, {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true}, {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true} ] \ No newline at end of file From a860c57bbc2c6ae788c91c103873dc942e7ff473 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 2 May 2024 15:16:00 +0200 Subject: [PATCH 13/20] updated .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 14cd4d345..6fafc7055 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ spark-warehouse /**/.factorypath /**/.scalafmt.conf /.java-version +/dhp-shade-package/dependency-reduced-pom.xml From db358ad0d2ffb63cd7215ec89e693274982b78e1 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 2 May 2024 15:25:57 +0200 Subject: [PATCH 14/20] code formatted --- .../eu/dnetlib/pace/common/PaceCommonUtils.java | 15 ++++++++------- .../main/java/eu/dnetlib/pace/model/Person.java | 11 ++++++----- .../java/eu/dnetlib/pace/util/Capitalise.java | 3 ++- .../pace/common/AbstractPaceFunctions.java | 13 +++++++------ 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java b/dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java index a279271b5..61fbc2470 100644 --- a/dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/pace/common/PaceCommonUtils.java @@ -1,19 +1,20 @@ package eu.dnetlib.pace.common; -import com.google.common.base.Splitter; -import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; -import com.ibm.icu.text.Transliterator; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; - import java.nio.charset.StandardCharsets; import java.text.Normalizer; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; + +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; +import com.ibm.icu.text.Transliterator; + /** * Set of common functions for the framework * diff --git a/dhp-common/src/main/java/eu/dnetlib/pace/model/Person.java b/dhp-common/src/main/java/eu/dnetlib/pace/model/Person.java index c95c9d823..6a1957183 100644 --- a/dhp-common/src/main/java/eu/dnetlib/pace/model/Person.java +++ b/dhp-common/src/main/java/eu/dnetlib/pace/model/Person.java @@ -1,20 +1,21 @@ package eu.dnetlib.pace.model; +import java.nio.charset.Charset; +import java.text.Normalizer; +import java.util.List; +import java.util.Set; + import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.hash.Hashing; + import eu.dnetlib.pace.common.PaceCommonUtils; import eu.dnetlib.pace.util.Capitalise; import eu.dnetlib.pace.util.DotAbbreviations; -import java.nio.charset.Charset; -import java.text.Normalizer; -import java.util.List; -import java.util.Set; - public class Person { private static final String UTF8 = "UTF-8"; diff --git a/dhp-common/src/main/java/eu/dnetlib/pace/util/Capitalise.java b/dhp-common/src/main/java/eu/dnetlib/pace/util/Capitalise.java index 015386423..671320c71 100644 --- 
a/dhp-common/src/main/java/eu/dnetlib/pace/util/Capitalise.java +++ b/dhp-common/src/main/java/eu/dnetlib/pace/util/Capitalise.java @@ -1,9 +1,10 @@ package eu.dnetlib.pace.util; -import com.google.common.base.Function; import org.apache.commons.lang3.text.WordUtils; +import com.google.common.base.Function; + public class Capitalise implements Function { private final char[] DELIM = { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 6bfb8b3f4..b055077d8 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -1,12 +1,6 @@ package eu.dnetlib.pace.common; -import com.google.common.base.Joiner; -import com.google.common.collect.Sets; -import com.ibm.icu.text.Transliterator; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; - import java.io.IOException; import java.io.StringWriter; import java.nio.charset.StandardCharsets; @@ -15,6 +9,13 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; + +import com.google.common.base.Joiner; +import com.google.common.collect.Sets; +import com.ibm.icu.text.Transliterator; + /** * Set of common functions for the framework * From 6efab4d88e7ce481896e5569e1801daf81c96777 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 16 May 2024 16:19:18 +0200 Subject: [PATCH 15/20] fixed scholexplorer bug --- .../dhp/sx/graph/scholix/ScholixUtils.scala | 2 +- dhp-shade-package/pom.xml | 150 +++++++++--------- .../dhp/sx/graph/ScholexplorerUtils.scala | 15 +- .../graph/SparkCreateScholexplorerDump.scala | 23 ++- .../graph/scholix/ScholixGenerationTest.scala | 17 +- 5 files changed, 112 insertions(+), 95 deletions(-) diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala index f256ca1a1..72a17777e 100644 --- a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala +++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala @@ -389,7 +389,7 @@ object ScholixUtils extends Serializable { if (persistentIdentifiers.isEmpty) return null s.setLocalIdentifier(persistentIdentifiers.asJava) - s.setTypology(r.getResulttype.getClassid) +// s.setTypology(r.getResulttype.getClassid) s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) diff --git a/dhp-shade-package/pom.xml b/dhp-shade-package/pom.xml index 128a57116..fd9c04066 100644 --- a/dhp-shade-package/pom.xml +++ b/dhp-shade-package/pom.xml @@ -31,86 +31,86 @@ dhp-actionmanager ${project.version} - - eu.dnetlib.dhp - dhp-aggregation - ${project.version} - - - eu.dnetlib.dhp - dhp-blacklist - ${project.version} - - - eu.dnetlib.dhp - dhp-broker-events - ${project.version} - - - eu.dnetlib.dhp - dhp-dedup-openaire - ${project.version} - - - eu.dnetlib.dhp - dhp-enrichment - ${project.version} - + + + + + + + + + + + + + + + + + + + + + + + + + eu.dnetlib.dhp dhp-graph-mapper ${project.version} - - eu.dnetlib.dhp - dhp-graph-provision - ${project.version} - - - eu.dnetlib.dhp - dhp-impact-indicators - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-actionsets - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-hist-snaps - 
${project.version} - - - eu.dnetlib.dhp - dhp-stats-monitor-irish - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-promote - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-update - ${project.version} - - - eu.dnetlib.dhp - dhp-swh - ${project.version} - - - eu.dnetlib.dhp - dhp-usage-raw-data-update - ${project.version} - - - eu.dnetlib.dhp - dhp-usage-stats-build - ${project.version} - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala index 95564d523..f62f271e3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala @@ -1,14 +1,8 @@ package eu.dnetlib.dhp.sx.graph +import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty} -import eu.dnetlib.dhp.schema.sx.scholix.{ - Scholix, - ScholixCollectedFrom, - ScholixEntityId, - ScholixIdentifier, - ScholixRelationship, - ScholixResource -} +import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixCollectedFrom, ScholixEntityId, ScholixIdentifier, ScholixRelationship, ScholixResource} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse @@ -28,6 +22,7 @@ case class RelKeyValue(key: String, value: String) {} object ScholexplorerUtils { val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier" + val mapper= new ObjectMapper() case class RelationVocabulary(original: String, inverse: String) {} @@ -242,7 +237,7 @@ object ScholexplorerUtils { s } - def updateTarget(s: Scholix, t: ScholixResource): Scholix = { + def updateTarget(s: Scholix, t: ScholixResource): String = { s.setTarget(t) val spublishers: Seq[ScholixEntityId] = @@ -251,6 +246,6 @@ object ScholexplorerUtils { if (t.getPublisher != null && !t.getPublisher.isEmpty) t.getPublisher.asScala else List() val mergedPublishers = spublishers.union(tpublishers).distinct.take(10).toList s.setPublisher(mergedPublishers.asJava) - s + mapper.writeValueAsString(s) } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala index 1211dcc78..32aa68665 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala @@ -11,7 +11,7 @@ import eu.dnetlib.dhp.schema.oaf.{ Dataset => OafDataset } import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource} -import org.apache.spark.sql.functions.{col, concat, expr, md5} +import org.apache.spark.sql.functions.{col, concat, expr, first, md5} import org.apache.spark.sql.types.StructType import org.apache.spark.sql._ import org.slf4j.{Logger, LoggerFactory} @@ -89,7 +89,13 @@ class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], lo .withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))")) .drop("collectedfrom") .withColumnRenamed("cf", "collectedfrom") - .distinct() + .groupBy(col("id")) + .agg( + first("source").alias("source"), + first("target").alias("target"), + 
first("relClass").alias("relClass"), + first("collectedfrom").alias("collectedfrom") + ) bidRel.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation") @@ -97,27 +103,32 @@ class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], lo def generateScholix(outputPath: String, spark: SparkSession): Unit = { implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource]) - implicit val scholixEncoder: Encoder[Scholix] = Encoders.bean(classOf[Scholix]) + implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix]) import spark.implicits._ val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo] val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource] + + val scholix_one_verse = relations .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner") .map(res => ScholexplorerUtils.generateScholix(res._1, res._2)) + .map(s=> (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix]))) + val resourceTarget = relations .joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner") .map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource]))) + scholix_one_verse - .joinWith(resourceTarget, scholix_one_verse("identifier") === resourceTarget("_1"), "inner") - .map(k => ScholexplorerUtils.updateTarget(k._1, k._2._2)) + .joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner") + .map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2)) .write .mode(SaveMode.Overwrite) .option("compression", "gzip") - .json(s"$outputPath/scholix") + .text(s"$outputPath/scholix") } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala index 0a2872cb4..67d40dcf1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala @@ -1,17 +1,28 @@ package eu.dnetlib.dhp.sx.graph.scholix +import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump -import org.apache.spark.sql.SparkSession +import org.apache.spark.SparkConf +import org.apache.spark.sql.{Encoder, Encoders, SparkSession} import org.junit.jupiter.api.Test +import org.objenesis.strategy.StdInstantiatorStrategy class ScholixGenerationTest { @Test def generateScholix(): Unit = { + val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate() val app = new SparkCreateScholexplorerDump(null, null, null) -// app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark) -// app.generateBidirectionalRelations("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark) +// app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark) +// app.generateBidirectionalRelations( +// "/home/sandro/Downloads/scholix_sample/", +// "/home/sandro/Downloads/scholix/", +// spark +// ) app.generateScholix("/home/sandro/Downloads/scholix/", spark) + + + } } From a87f9ea64317dff7afac5045a4c64bb9c8a26954 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 17 May 2024 14:16:43 +0200 Subject: [PATCH 
16/20] fixed scholexplorer bug --- .../eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala | 11 +++++++++-- .../dhp/sx/graph/SparkCreateScholexplorerDump.scala | 6 +----- .../dhp/sx/graph/scholix/ScholixGenerationTest.scala | 2 -- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala index f62f271e3..d171d96d9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/ScholexplorerUtils.scala @@ -2,7 +2,14 @@ package eu.dnetlib.dhp.sx.graph import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty} -import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixCollectedFrom, ScholixEntityId, ScholixIdentifier, ScholixRelationship, ScholixResource} +import eu.dnetlib.dhp.schema.sx.scholix.{ + Scholix, + ScholixCollectedFrom, + ScholixEntityId, + ScholixIdentifier, + ScholixRelationship, + ScholixResource +} import org.json4s import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse @@ -22,7 +29,7 @@ case class RelKeyValue(key: String, value: String) {} object ScholexplorerUtils { val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier" - val mapper= new ObjectMapper() + val mapper = new ObjectMapper() case class RelationVocabulary(original: String, inverse: String) {} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala index 32aa68665..dd420ab95 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateScholexplorerDump.scala @@ -109,19 +109,15 @@ class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], lo val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo] val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource] - - val scholix_one_verse = relations .joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner") .map(res => ScholexplorerUtils.generateScholix(res._1, res._2)) - .map(s=> (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix]))) - + .map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix]))) val resourceTarget = relations .joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner") .map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource]))) - scholix_one_verse .joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner") .map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2)) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala index 67d40dcf1..204fe9794 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala +++ 
b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGenerationTest.scala @@ -22,7 +22,5 @@ class ScholixGenerationTest { // ) app.generateScholix("/home/sandro/Downloads/scholix/", spark) - - } } From c7b32bbacc43b89a652fa9cbf8c2982150b64b36 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 23 May 2024 13:00:19 +0300 Subject: [PATCH 17/20] Update CopyDataToImpalaCluster: Update the code of acquiring the entities from Ocean cluster, through hive, in order to optimize the process and account for additional reserved keywords in Impala. Co-authored-by: Antonis Lempesis --- .../oozie_app/copyDataToImpalaCluster.sh | 26 +++---------------- .../oozie_app/copyDataToImpalaCluster.sh | 26 +++---------------- .../oozie_app/copyDataToImpalaCluster.sh | 26 +++---------------- .../oozie_app/copyDataToImpalaCluster.sh | 25 +++--------------- 4 files changed, 16 insertions(+), 87 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index 059fb9089..f0ea50cbd 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -39,23 +39,9 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" - # Set sed arguments. LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) -# Set the SED command arguments for column-names with reserved words: -DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' -DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. -DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' - -HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' -HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' -HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' - -LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' -LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' -LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' - function copydb() { db=$1 @@ -109,17 +95,13 @@ function copydb() { num_tables=0 entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. - for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. - create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
- - create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. if [ -n "$create_view_statement_test" ]; then echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" - create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ - | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ - | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ - | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"` all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 1130a684d..8d32e11fb 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -38,23 +38,9 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" - # Set sed arguments. LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) -# Set the SED command arguments for column-names with reserved words: -DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' -DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. -DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' - -HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' -HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' -HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' - -LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' -LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' -LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' - function copydb() { db=$1 @@ -108,17 +94,13 @@ function copydb() { num_tables=0 entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. 
- for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. - create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. - - create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. if [ -n "$create_view_statement_test" ]; then echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" - create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ - | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ - | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ - | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"` all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index de275145b..ece71a634 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -38,23 +38,9 @@ IMPALA_CONFIG_FILE='/etc/impala_cluster/hdfs-site.xml' IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" - # Set sed arguments. LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) -# Set the SED command arguments for column-names with reserved words: -DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' -DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. 
-DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' - -HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' -HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' -HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' - -LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' -LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' -LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' - function copydb() { db=$1 @@ -108,17 +94,13 @@ function copydb() { num_tables=0 entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. - for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. - create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. - - create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. if [ -n "$create_view_statement_test" ]; then echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" - create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ - | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ - | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ - | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"` all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 6fc0aa745..109f9111c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -40,19 +40,6 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" # Set sed arguments. 
LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) -# Set the SED command arguments for column-names with reserved words: -DATE_SED_ARG_1='s/[[:space:]]\date[[:space:]]/\`date\`/g' -DATE_SED_ARG_2='s/\.date,/\.\`date\`,/g' # the "date" may be part of a larger field name like "datestamp" or "date_aggregated", so we need to be careful with what we are replacing. -DATE_SED_ARG_3='s/\.date[[:space:]]/\.\`date\` /g' - -HASH_SED_ARG_1='s/[[:space:]]\hash[[:space:]]/\`hash\`/g' -HASH_SED_ARG_2='s/\.hash,/\.\`hash\`,/g' -HASH_SED_ARG_3='s/\.hash[[:space:]]/\.\`hash\` /g' - -LOCATION_SED_ARG_1='s/[[:space:]]\location[[:space:]]/\`location\`/g' -LOCATION_SED_ARG_2='s/\.location,/\.\`location\`,/g' -LOCATION_SED_ARG_3='s/\.location[[:space:]]/\.\`location\` /g' - export HADOOP_USER_NAME=$6 export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" @@ -110,17 +97,13 @@ function copydb() { num_tables=0 entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. - for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. + for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elements are single-words. # Check if this is a view by showing the create-statement where it should print "create view" for a view, not the "create table". Unfortunately, there is no "show views" command. - create_entity_statement=`hive -e "show create table ${db}.${i};"` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. - - create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` + create_entity_statement=`hive --database ${db} -e "show create table ${i};"` # We need to use the "--database", instead of including it inside the query, in order to return the statements with the '`' chars being in the right place to be used by impala-shell. However, we need to add the db-name in the "CREATE VIEW view_name" statement. + create_view_statement_test=`echo -e "$create_entity_statement" | grep 'CREATE VIEW'` # It needs to happen in two stages, otherwise the "grep" is not able to match multi-line statement. 
if [ -n "$create_view_statement_test" ]; then echo -e "\n'${i}' is a view, so we will save its 'create view' statement and execute it on Impala, after all tables have been created.\n" - create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/\`//g' \ - | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "${DATE_SED_ARG_1}" | sed "${HASH_SED_ARG_1}" | sed "${LOCATION_SED_ARG_1}" \ - | sed "${DATE_SED_ARG_2}" | sed "${HASH_SED_ARG_2}" | sed "${LOCATION_SED_ARG_2}" \ - | sed "${DATE_SED_ARG_3}" | sed "${HASH_SED_ARG_3}" | sed "${LOCATION_SED_ARG_3}"` + create_view_statement=`echo -e "$create_entity_statement" | sed 's/WARN:.*//g' | sed 's/"$/;/' | sed 's/^"//' | sed 's/\\"\\"/\"/g' | sed -e "${LOCATION_HDFS_NODE_SED_ARG}" | sed "s/CREATE VIEW /CREATE VIEW ${db}./"` all_create_view_statements+=("$create_view_statement") else echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" From 68322843e2a1fd352ac372838a1da99d2bcb0a44 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 23 May 2024 15:07:49 +0300 Subject: [PATCH 18/20] Small updates to the copy-operation to Impala Cluster: - Add a configuration-"switch" to control whether the script exits upon an error or not. - Allow the script to exit when a table could not be created. - Show the elapsed time for processing each database. --- .../oozie_app/copyDataToImpalaCluster.sh | 44 ++++++++++++++--- .../oozie_app/copyDataToImpalaCluster.sh | 46 +++++++++++++++--- .../oozie_app/copyDataToImpalaCluster.sh | 45 +++++++++++++++--- .../oozie_app/copyDataToImpalaCluster.sh | 47 +++++++++++++++---- 4 files changed, 153 insertions(+), 29 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index f0ea50cbd..f829cecc1 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -8,6 +8,7 @@ fi export HADOOP_USER_NAME=$2 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' @@ -30,7 +31,9 @@ while [ $COUNTER -lt 3 ]; do done if [ -z "$IMPALA_HDFS_NODE" ]; then echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! 
| AFTER ${COUNTER} RETRIES.\n\n" - exit 1 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 1 + fi fi echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" @@ -43,8 +46,21 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) +function print_elapsed_time() +{ + start_time=$1 + end_time=$(date +%s) + elapsed_time=$(($end_time-$start_time)) + hours=$((elapsed_time / 3600)) + minutes=$(((elapsed_time % 3600) / 60)) + seconds=$((elapsed_time % 60)) + printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds +} + + function copydb() { db=$1 + start_db_time=$(date +%s) echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). @@ -53,7 +69,9 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 2 + fi fi echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" @@ -77,7 +95,9 @@ function copydb() { else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log - exit 3 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 3 + fi fi # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. @@ -109,12 +129,17 @@ function copydb() { CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" - exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 5 + fi fi fi fi @@ -158,7 +183,9 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! 
EXITING..\n\n" - exit 5 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 6 + fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" else @@ -186,11 +213,14 @@ function copydb() { else echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log - exit 6 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 7 + fi fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" + echo -e "\n\nFinished processing db: ${db}\n" + print_elapsed_time start_db_time } diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 8d32e11fb..0af44a2cc 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -8,6 +8,9 @@ fi export HADOOP_USER_NAME=$2 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 + + # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" @@ -29,7 +32,9 @@ while [ $COUNTER -lt 3 ]; do done if [ -z "$IMPALA_HDFS_NODE" ]; then echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" - exit 1 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 1 + fi fi echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" @@ -42,8 +47,21 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) +function print_elapsed_time() +{ + start_time=$1 + end_time=$(date +%s) + elapsed_time=$(($end_time-$start_time)) + hours=$((elapsed_time / 3600)) + minutes=$(((elapsed_time % 3600) / 60)) + seconds=$((elapsed_time % 60)) + printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds +} + + function copydb() { db=$1 + start_db_time=$(date +%s) echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). @@ -52,7 +70,9 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 2 + fi fi echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" @@ -76,7 +96,9 @@ function copydb() { else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log - exit 3 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 3 + fi fi # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. 
@@ -108,12 +130,17 @@ function copydb() { CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" - exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 5 + fi fi fi fi @@ -157,7 +184,9 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" - exit 5 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 6 + fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" else @@ -185,11 +214,14 @@ function copydb() { else echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log - exit 6 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 7 + fi fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" + echo -e "\n\nFinished processing db: ${db}\n" + print_elapsed_time start_db_time } diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index ece71a634..46d495578 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -8,6 +8,8 @@ fi export HADOOP_USER_NAME=$2 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 + # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' echo -e "\nOCEAN HDFS virtual-name which resolves automatically to the active-node: ${OCEAN_HDFS_NODE}" @@ -29,7 +31,9 @@ while [ $COUNTER -lt 3 ]; do done if [ -z "$IMPALA_HDFS_NODE" ]; then echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! 
| AFTER ${COUNTER} RETRIES.\n\n" - exit 1 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 1 + fi fi echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" @@ -42,8 +46,21 @@ IMPALA_HDFS_DB_BASE_PATH="${IMPALA_HDFS_NODE}/user/hive/warehouse" LOCATION_HDFS_NODE_SED_ARG="s|${OCEAN_HDFS_NODE}|${IMPALA_HDFS_NODE}|g" # This requires to be used with "sed -e" in order to have the "|" delimiter (as the "/" conflicts with the URIs) +function print_elapsed_time() +{ + start_time=$1 + end_time=$(date +%s) + elapsed_time=$(($end_time-$start_time)) + hours=$((elapsed_time / 3600)) + minutes=$(((elapsed_time % 3600) / 60)) + seconds=$((elapsed_time % 60)) + printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds +} + + function copydb() { db=$1 + start_db_time=$(date +%s) echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). @@ -52,7 +69,9 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 2 + fi fi echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" @@ -76,7 +95,9 @@ function copydb() { else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log - exit 3 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 3 + fi fi # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. @@ -108,12 +129,17 @@ function copydb() { CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" - exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 5 + fi fi fi fi @@ -157,7 +183,9 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! 
EXITING..\n\n" - exit 5 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 6 + fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" else @@ -185,11 +213,14 @@ function copydb() { else echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log - exit 6 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 7 + fi fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" + echo -e "\n\nFinished processing db: ${db}\n" + print_elapsed_time start_db_time } diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index 109f9111c..cd9019746 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -6,6 +6,8 @@ then ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} fi +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 + # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' @@ -28,7 +30,9 @@ while [ $COUNTER -lt 3 ]; do done if [ -z "$IMPALA_HDFS_NODE" ]; then echo -e "\n\nERROR: PROBLEM WHEN SETTING THE HDFS-NODE FOR IMPALA CLUSTER! | AFTER ${COUNTER} RETRIES.\n\n" - exit 1 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 1 + fi fi echo -e "Active IMPALA HDFS Node: ${IMPALA_HDFS_NODE} , after ${COUNTER} retries.\n\n" @@ -45,8 +49,21 @@ export HADOOP_USER_NAME=$6 export PROD_USAGE_STATS_DB="openaire_prod_usage_stats" +function print_elapsed_time() +{ + start_time=$1 + end_time=$(date +%s) + elapsed_time=$(($end_time-$start_time)) + hours=$((elapsed_time / 3600)) + minutes=$(((elapsed_time % 3600) / 60)) + seconds=$((elapsed_time % 60)) + printf "\nElapsed time: %02d:%02d:%02d\n\n" $hours $minutes $seconds +} + + function copydb() { db=$1 + start_db_time=$(date +%s) echo -e "\nStart processing db: '${db}'..\n" # Delete the old DB from Impala cluster (if exists). @@ -55,7 +72,9 @@ function copydb() { if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" rm -f error.log - exit 2 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 2 + fi fi echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" @@ -79,7 +98,9 @@ function copydb() { else echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n" rm -f error.log - exit 3 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 3 + fi fi # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. @@ -111,12 +132,17 @@ function copydb() { CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" - exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check. 
+ if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 4 + fi else impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` if [ -n "$log_errors" ]; then echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 5 + fi fi fi fi @@ -160,7 +186,9 @@ function copydb() { if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" - exit 5 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 6 + fi elif [[ $new_num_of_views_to_retry -gt 0 ]]; then echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" else @@ -188,11 +216,14 @@ function copydb() { else echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" rm -f error.log - exit 6 + if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then + exit 7 + fi fi rm -f error.log - echo -e "\n\nFinished processing db: ${db}\n\n" + echo -e "\n\nFinished processing db: ${db}\n" + print_elapsed_time start_db_time } STATS_DB=$1 @@ -216,6 +247,6 @@ copydb $MONITOR_DB'_ris_tail' contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other" for i in ${contexts} do - tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'` + tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'` copydb ${MONITOR_DB}'_'${tmp} done \ No newline at end of file From b48ed6e617038aaaf9677f7eb9143ab3a464f82d Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 23 May 2024 16:58:12 +0300 Subject: [PATCH 19/20] Change configuration in the copy-operation to Impala Cluster: Set the "SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR" parameter to "false". --- .../stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh | 3 ++- .../stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh | 2 +- .../graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh | 3 ++- .../dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh index f829cecc1..26760d650 100644 --- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh @@ -8,7 +8,8 @@ fi export HADOOP_USER_NAME=$2 -SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0 + # Set the active HDFS node of OCEAN and IMPALA cluster. 
OCEAN_HDFS_NODE='hdfs://nameservice1' diff --git a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh index 0af44a2cc..26760d650 100644 --- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh @@ -8,7 +8,7 @@ fi export HADOOP_USER_NAME=$2 -SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0 # Set the active HDFS node of OCEAN and IMPALA cluster. diff --git a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh index 46d495578..1ab3e417a 100644 --- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh @@ -8,7 +8,8 @@ fi export HADOOP_USER_NAME=$2 -SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0 + # Set the active HDFS node of OCEAN and IMPALA cluster. OCEAN_HDFS_NODE='hdfs://nameservice1' diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh index cd9019746..7957a659c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh @@ -6,7 +6,7 @@ then ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} fi -SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=1 +SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0 # Set the active HDFS node of OCEAN and IMPALA cluster. 
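Taken together, patches 18 and 19 move the four copyDataToImpalaCluster.sh scripts from fail-fast to log-and-continue behaviour: every fatal exit is wrapped in a check of a single SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR switch, a print_elapsed_time helper reports the per-database duration, and patch 19 then flips the switch to 0 so one broken database no longer aborts the copies that follow. The bash sketch below condenses that pattern into a minimal runnable form; run_copy_step and some_command_that_may_fail are hypothetical stand-ins for the scripts' real drop-database/distcp/create-table steps, not code from the repository.

#!/usr/bin/env bash
# Minimal sketch of the error-handling pattern introduced by patches 18-19.
# run_copy_step and some_command_that_may_fail are hypothetical placeholders.

SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR=0  # patch 18 adds the switch as 1; patch 19 relaxes it to 0

function print_elapsed_time() {
  local start_time=$1 end_time elapsed_time
  end_time=$(date +%s)
  elapsed_time=$((end_time - start_time))
  printf "\nElapsed time: %02d:%02d:%02d\n\n" \
    $((elapsed_time / 3600)) $(((elapsed_time % 3600) / 60)) $((elapsed_time % 60))
}

function some_command_that_may_fail() { false; }  # demo stub that always fails

function run_copy_step() {
  local db=$1 start_db_time
  start_db_time=$(date +%s)
  if ! some_command_that_may_fail "$db"; then
    echo -e "\n\nERROR: step failed for db '${db}'!\n\n"
    # Inside [[ ... -eq ... ]] the operands are evaluated arithmetically, so a bare
    # variable name resolves to its value; this is why the patches can omit the '$'.
    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
      exit 2
    fi
  fi
  print_elapsed_time "$start_db_time"
}

for db in stats_db monitor_db observatory_db; do  # hypothetical database names
  run_copy_step "$db"
done

With the switch at 0, the loop reports each failure plus the elapsed time and moves on to the next database; setting it back to 1 restores the original fail-fast behaviour, with a distinct exit code per failure site as in the patched scripts.
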
From 15b54a345aa329fe0e256aa1e8c84050d30308f2 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Fri, 24 May 2024 13:21:28 +0300 Subject: [PATCH 20/20] added fos lvl4 --- .../dhp/oa/graph/stats/oozie_app/scripts/step7.sql | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index eb16a161e..c0993ef0b 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -129,11 +129,14 @@ create table ${stats_db_name}.result_fos stored as parquet as with lvl1 as (select id, topic from ${stats_db_name}.result_topics where topic like '__ %' and type='Fields of Science and Technology classification'), lvl2 as (select id, topic from ${stats_db_name}.result_topics where topic like '____ %' and type='Fields of Science and Technology classification'), - lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification') -select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3 + lvl3 as (select id, topic from ${stats_db_name}.result_topics where topic like '______ %' and type='Fields of Science and Technology classification'), + lvl4 as (select id, topic from ${stats_db_name}.result_topics where topic like '________ %' and type='Fields of Science and Technology classification') +select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4 from lvl1 join lvl2 on lvl1.id=lvl2.id and substr(lvl2.topic, 1, 2)=substr(lvl1.topic, 1, 2) - join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4); + join lvl3 on lvl3.id=lvl1.id and substr(lvl3.topic, 1, 4)=substr(lvl2.topic, 1, 4) + join lvl4 on lvl4.id=lvl1.id and substr(lvl4.topic, 1, 6)=substr(lvl3.topic, 1, 6); + DROP TABLE IF EXISTS ${stats_db_name}.result_organization purge;
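
The new lvl4 branch in step7.sql follows the same convention as the existing levels: each FOS topic string begins with a numeric code whose length encodes its depth (2, 4, 6 or 8 digits followed by a space), and a child level joins to its parent by sharing the parent's code as a prefix. The self-contained Hive SQL sketch below walks that chain for a single record; the id and the level-3/level-4 labels are invented for illustration, and the type='Fields of Science and Technology classification' filter of the real script is omitted for brevity.

with topics as (
    select 'r1' as id, '01 natural sciences' as topic
    union all select 'r1', '0102 computer and information sciences'
    union all select 'r1', '010203 hypothetical level-3 label'
    union all select 'r1', '01020305 hypothetical level-4 label'
),
lvl1 as (select id, topic from topics where topic like '__ %'),
lvl2 as (select id, topic from topics where topic like '____ %'),
lvl3 as (select id, topic from topics where topic like '______ %'),
lvl4 as (select id, topic from topics where topic like '________ %')
select lvl1.id, lvl1.topic as lvl1, lvl2.topic as lvl2, lvl3.topic as lvl3, lvl4.topic as lvl4
from lvl1
    join lvl2 on lvl1.id = lvl2.id and substr(lvl2.topic, 1, 2) = substr(lvl1.topic, 1, 2)
    join lvl3 on lvl3.id = lvl1.id and substr(lvl3.topic, 1, 4) = substr(lvl2.topic, 1, 4)
    join lvl4 on lvl4.id = lvl1.id and substr(lvl4.topic, 1, 6) = substr(lvl3.topic, 1, 6);

Note that because all four levels are inner-joined, the rebuilt result_fos lists only results classified all the way down to level 4: results whose deepest FOS annotation is level 3 drop out of the table, a behavioural change with respect to the previous three-level query.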