From f53eaafc23fae01ff97e23b2e479bb7396fe6bd4 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Mon, 9 Dec 2024 16:59:54 +0100
Subject: [PATCH] upgrading to Spark 3.4.2-openaire: removed spark parameters
 spark2ExtraListeners, spark2SqlQueryExecutionListeners, upgraded
 openaire-solr-importer to 1.0.2, dhp-schemas to 10.0.1, removed
 com.lucidworks.spark:spark-solr

---
 dhp-workflows/dhp-graph-provision/pom.xml     | 98 +------------------
 .../oa/provision/oozie_app/config-default.xml |  8 --
 .../dhp/oa/provision/oozie_app/workflow.xml   | 55 +----------
 .../oa/oaipmh/IrishOaiExporterJobTest.java    |  5 +-
 pom.xml                                       | 43 ++------
 5 files changed, 17 insertions(+), 192 deletions(-)

diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml
index 2eecf1b82..992fca8a5 100644
--- a/dhp-workflows/dhp-graph-provision/pom.xml
+++ b/dhp-workflows/dhp-graph-provision/pom.xml
@@ -81,98 +81,6 @@
 			<artifactId>solr-solrj</artifactId>
 		</dependency>
-
-		<dependency>
-			<groupId>junit</groupId>
-			<artifactId>junit</artifactId>
-			<version>4.12</version>
-			<scope>test</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.apache.solr</groupId>
-			<artifactId>solr-test-framework</artifactId>
-			<scope>test</scope>
-			<exclusions>
-				<exclusion>
-					<groupId>com.carrotsearch</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>com.carrotsearch.randomizedtesting</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>com.fasterxml.jackson.core</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>com.fasterxml.jackson.dataformat</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.codehaus.jackson</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.codehaus.janino</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.codehaus.woodstox</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>com.github.ben-manes.caffeine</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>com.google.guava</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>com.google.protobuf</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>com.lmax</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>com.tdunning</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.apache.hadoop</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.apache.zookeeper</groupId>
-					<artifactId>zookeeper</artifactId>
-				</exclusion>
-				<exclusion>
-					<artifactId>ant</artifactId>
-					<groupId>org.apache.ant</groupId>
-				</exclusion>
-				<exclusion>
-					<artifactId>antlr4-runtime</artifactId>
-					<groupId>org.antlr</groupId>
-				</exclusion>
-				<exclusion>
-					<artifactId>woodstox-core</artifactId>
-					<groupId>com.fasterxml.woodstox</groupId>
-				</exclusion>
-				<exclusion>
-					<groupId>log4j</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-				<exclusion>
-					<groupId>org.apache.logging.log4j</groupId>
-					<artifactId>*</artifactId>
-				</exclusion>
-			</exclusions>
-		</dependency>
-
 		<dependency>
 			<groupId>io.dropwizard.metrics</groupId>
 			<artifactId>metrics-core</artifactId>
@@ -183,6 +91,7 @@
 			<groupId>org.apache.httpcomponents</groupId>
 			<artifactId>httpclient</artifactId>
 		</dependency>
+
 		<dependency>
 			<groupId>org.apache.httpcomponents</groupId>
 			<artifactId>httpmime</artifactId>
@@ -191,13 +100,8 @@
 		<dependency>
 			<groupId>org.elasticsearch</groupId>
 			<artifactId>elasticsearch-hadoop</artifactId>
 		</dependency>
-
-		<dependency>
-			<groupId>org.noggit</groupId>
-			<artifactId>noggit</artifactId>
-		</dependency>
 		<dependency>
 			<groupId>org.apache.zookeeper</groupId>
 			<artifactId>zookeeper</artifactId>
 		</dependency>
diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml
index cb830bb1a..866aae76c 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/config-default.xml
@@ -27,14 +27,6 @@
         <name>spark2EventLogDir</name>
         <value>/user/spark/spark2ApplicationHistory</value>
     </property>
-    <property>
-        <name>spark2ExtraListeners</name>
-        <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
-    </property>
-    <property>
-        <name>spark2SqlQueryExecutionListeners</name>
-        <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
-    </property>
     <property>
         <name>oozieActionShareLibForSpark2</name>
         <value>spark342</value>
diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
index 570005012..0127203e3 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
@@ -92,16 +92,6 @@
             <name>oozieActionShareLibForSpark2</name>
             <description>oozie action sharelib for spark 2.*</description>
         </property>
-        <property>
-            <name>spark2ExtraListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
-            <description>spark 2.* extra listeners classname</description>
-        </property>
-        <property>
-            <name>spark2SqlQueryExecutionListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
-            <description>spark 2.* sql query execution listeners classname</description>
-        </property>
         <property>
             <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
@@ -158,8 +148,6 @@
             --executor-memory=6G
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=6G
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=15000
@@ -199,8 +187,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=15000
@@ -227,8 +213,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=15000
@@ -255,8 +239,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=10000
@@ -283,8 +265,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=5000
@@ -311,8 +291,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=5000
@@ -339,8 +317,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=5000
@@ -367,8 +343,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=5000
@@ -395,8 +369,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=5000
@@ -436,8 +408,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=15000
@@ -465,8 +435,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=10000
@@ -494,8 +462,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=10000
@@ -523,8 +489,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=5000
@@ -552,8 +516,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=8000
@@ -581,8 +543,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=10000
@@ -610,8 +570,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=5000
@@ -639,8 +597,6 @@
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=5000
@@ -670,8 +626,6 @@
             --executor-memory=${sparkExecutorMemory}
             --driver-memory=${sparkDriverMemory}
             --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=3840
@@ -724,10 +678,12 @@
             --executor-memory=${sparkExecutorMemoryForIndexing}
             --driver-memory=${sparkDriverMemoryForIndexing}
+            --conf spark.driver.memoryOverhead=${sparkDriverMemoryForIndexing}
+            --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForIndexing}
             --conf spark.dynamicAllocation.enabled=true
             --conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+            --conf spark.extraListeners=
+            --conf spark.sql.queryExecutionListeners=
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.speculation=false
@@ -739,6 +695,7 @@
             <arg>--path</arg><arg>${workingDir}/xml_json</arg>
             <arg>--collection</arg><arg>${shadowFormat}-index-openaire</arg>
             <arg>--zkHost</arg><arg>${zkHost}</arg>
+            <arg>--batchSize</arg><arg>${batchSize}</arg>
@@ -772,8 +729,6 @@
             --executor-cores=${sparkExecutorCoresForJoining}
             --executor-memory=${sparkExecutorMemoryForJoining}
             --driver-memory=${sparkDriverMemoryForJoining}
-            --conf spark.extraListeners=${spark2ExtraListeners}
-            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java
index c16f75e1d..368ad8c7d 100644
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJobTest.java
@@ -1,7 +1,7 @@
 package eu.dnetlib.dhp.oa.oaipmh;
 
-import static org.junit.Assert.assertNull;
+import static org.junit.jupiter.api.Assertions.*;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
@@ -17,6 +17,7 @@ import org.apache.commons.io.IOUtils;
 import org.dom4j.Document;
 import org.dom4j.DocumentException;
 import org.dom4j.io.SAXReader;
+import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
 public class IrishOaiExporterJobTest {
@@ -63,7 +64,7 @@ public class IrishOaiExporterJobTest {
 
 	@Test
 	void testGzip_empty() {
-		assertNull(IrishOaiExporterJob.gzip(""));
+		Assertions.assertNull(IrishOaiExporterJob.gzip(""));
 		assertNull(IrishOaiExporterJob.gzip(null));
 	}
 
diff --git a/pom.xml b/pom.xml
index e815dc5f8..f685f92c7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -180,7 +180,7 @@
 				<groupId>eu.dnetlib.dhp</groupId>
 				<artifactId>openaire-solr-importer</artifactId>
-				<version>[1.0.0]</version>
+				<version>[1.0.2]</version>
 			</dependency>
 			<dependency>
 				<groupId>org.apache.hadoop</groupId>
@@ -365,30 +365,8 @@
 				<groupId>org.apache.solr</groupId>
 				<artifactId>solr-solrj</artifactId>
 				<version>${solr.version}</version>
-				<exclusions>
-					<exclusion>
-						<groupId>*</groupId>
-						<artifactId>*</artifactId>
-					</exclusion>
-				</exclusions>
-			</dependency>
-
-			<dependency>
-				<groupId>com.lucidworks.spark</groupId>
-				<artifactId>spark-solr</artifactId>
-				<version>${sparksolr.version}</version>
-				<exclusions>
-					<exclusion>
-						<groupId>*</groupId>
-						<artifactId>*</artifactId>
-					</exclusion>
-				</exclusions>
-			</dependency>
-
-			<dependency>
-				<groupId>org.apache.solr</groupId>
-				<artifactId>solr-test-framework</artifactId>
-				<version>${solr.version}</version>
-				<scope>test</scope>
 			</dependency>
+
 			<dependency>
 				<groupId>io.dropwizard.metrics</groupId>
 				<artifactId>metrics-core</artifactId>
@@ -396,22 +374,18 @@
 				<scope>test</scope>
 			</dependency>
-
 			<dependency>
 				<groupId>org.apache.httpcomponents</groupId>
 				<artifactId>httpclient</artifactId>
 				<version>${org.apache.httpcomponents.version}</version>
 			</dependency>
+
 			<dependency>
 				<groupId>org.apache.httpcomponents</groupId>
 				<artifactId>httpmime</artifactId>
 				<version>${org.apache.httpcomponents.version}</version>
 			</dependency>
-			<dependency>
-				<groupId>org.noggit</groupId>
-				<artifactId>noggit</artifactId>
-				<version>0.8</version>
-			</dependency>
+
 			<dependency>
 				<groupId>org.apache.zookeeper</groupId>
 				<artifactId>zookeeper</artifactId>
@@ -657,6 +631,7 @@
 				<artifactId>javassist</artifactId>
 				<version>${javassist.version}</version>
 			</dependency>
+
 		</dependencies>
@@ -942,7 +917,7 @@
 		1.1.3
 		1.7
 		1.0.7
-		[10.0.0]
+		[10.0.1]
 		cdh5.9.2
 		3.5
 		11.0.2
@@ -1015,8 +990,7 @@
 				1.7
 				14.0.1
-				<solr.version>8.11.0</solr.version>
-				<sparksolr.version>4.0.4</sparksolr.version>
+				<solr.version>9.7.0</solr.version>
 				3.4.2.openaire
 				2.14.2
 				3.12.0
@@ -1052,8 +1026,7 @@
 				1.7
 				14.0.1
-				<solr.version>8.11.0</solr.version>
-				<sparksolr.version>4.0.4</sparksolr.version>
+				<solr.version>9.7.0</solr.version>
 				3.5.1.openaire-SNAPSHOT
 				2.15.2
 				3.12.0