From 729671789d5d7e500fa5b20a20a0bc78a9e30789 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Thu, 12 Dec 2024 11:33:30 +0100
Subject: [PATCH] adjusting the workflow to make it work with Java17 and Spark 3.4.2-openaire

---
 .../oa/provision/SolrAdminApplication.java    |   7 +-
 .../provision/input_solradmin_parameters.json |   6 -
 .../dhp/oa/provision/oozie_app/workflow.xml   | 110 +++++++++++++++---
 pom.xml                                       |   2 +-
 4 files changed, 99 insertions(+), 26 deletions(-)

diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
index 2bf7d3fbb..614ff96c3 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
@@ -37,9 +37,6 @@ public class SolrAdminApplication implements Closeable {
 				.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_solradmin_parameters.json")));
 		parser.parseArgument(args);
 
-		final String isLookupUrl = parser.get("isLookupUrl");
-		log.info("isLookupUrl: {}", isLookupUrl);
-
 		final Action action = Action.valueOf(parser.get("action"));
 		log.info("action: {}", action);
 
@@ -52,9 +49,7 @@ public class SolrAdminApplication implements Closeable {
 			.orElse(false);
 		log.info("commit: {}", commit);
 
-		final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
-
-		final String zkHost = isLookup.getZkHost();
+		final String zkHost = parser.get("zkHost");
 		log.info("zkHost: {}", zkHost);
 
 		final String publicFormat = parser.get("publicFormat");
diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_solradmin_parameters.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_solradmin_parameters.json
index 23a378857..92de01d66 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_solradmin_parameters.json
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/input_solradmin_parameters.json
@@ -1,10 +1,4 @@
 [
-  {
-    "paramName": "isu",
-    "paramLongName": "isLookupUrl",
-    "paramDescription": "the URL to the ISLookUp Service",
-    "paramRequired": true
-  },
   {
     "paramName": "a",
     "paramLongName": "action",
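[Note] With isLookupUrl gone, SolrAdminApplication no longer resolves the
ZooKeeper address through the ISLookUp service; callers must pass it directly
via --zkHost. The following SolrJ sketch shows roughly what the DELETE_BY_QUERY
and COMMIT actions amount to once the client is built from that address. The
connection values are hypothetical and this is an illustration, not the class's
actual code:

    import java.util.List;
    import java.util.Optional;

    import org.apache.solr.client.solrj.impl.CloudSolrClient;

    public class SolrAdminSketch {
        public static void main(String[] args) throws Exception {
            // Hypothetical ZooKeeper ensemble and chroot; in the workflow these
            // values come from the zkHost parameter.
            List<String> zkHosts = List.of("zk1:2181", "zk2:2181", "zk3:2181");
            String collection = "shadow"; // default value introduced by this patch
            try (CloudSolrClient client = new CloudSolrClient.Builder(zkHosts, Optional.of("/solr")).build()) {
                client.deleteByQuery(collection, "*:*"); // DELETE_BY_QUERY with a sample query
                client.commit(collection);               // COMMIT
            }
        }
    }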
diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
index 0127203e3..288447e7c 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml
@@ -39,8 +39,9 @@
             <description>maximum number of relations allowed for a each entity grouping by target</description>
         </property>
         <property>
-            <name>shadowFormat</name>
-            <description>metadata format name (DMF|TMF)</description>
+            <name>collection</name>
+            <value>shadow</value>
+            <description>collection name, indexing target</description>
         </property>
         <property>
             <name>batchSize</name>
@@ -104,6 +105,11 @@
             <name>sparkNetworkTimeout</name>
             <description>configures spark.network.timeout</description>
         </property>
+        <property>
+            <name>JAVA_HOME</name>
+            <value>/srv/java/openjdk-17</value>
+            <description>Used to configure the Java home location</description>
+        </property>
     </parameters>
 
     <global>
@@ -114,6 +120,10 @@
                 <name>oozie.action.sharelib.for.spark</name>
                 <value>${oozieActionShareLibForSpark2}</value>
             </property>
+            <property>
+                <name>oozie.launcher.mapreduce.map.env</name>
+                <value>JAVA_HOME=${JAVA_HOME}</value>
+            </property>
         </configuration>
     </global>
@@ -148,10 +158,14 @@
                 --executor-memory=6G
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=6G
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=15000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputRelationsPath</arg><arg>${inputGraphRootPath}/relation</arg>
             <arg>--outputPath</arg><arg>${workingDir}/relation</arg>
@@ -187,10 +201,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=15000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/publication</arg>
@@ -213,10 +231,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=15000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/dataset</arg>
@@ -239,10 +261,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=10000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/otherresearchproduct</arg>
@@ -265,10 +291,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=5000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/software</arg>
@@ -291,10 +321,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=5000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/datasource</arg>
@@ -317,10 +351,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=5000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/organization</arg>
@@ -343,10 +381,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=5000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/project</arg>
@@ -369,10 +411,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=5000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/person</arg>
@@ -408,10 +454,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=15000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/publication</arg>
             <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@@ -435,10 +485,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=10000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/dataset</arg>
             <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
@@ -462,10 +516,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=10000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/otherresearchproduct</arg>
             <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
@@ -489,10 +547,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=5000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/software</arg>
             <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
@@ -516,10 +578,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=8000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/datasource</arg>
             <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
@@ -543,10 +609,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=10000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/organization</arg>
             <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
@@ -570,10 +640,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=5000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/project</arg>
             <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
@@ -597,10 +671,14 @@
                 --executor-memory=${sparkExecutorMemoryForJoining}
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=5000
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/person</arg>
             <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
@@ -626,10 +704,14 @@
                 --executor-memory=${sparkExecutorMemory}
                 --driver-memory=${sparkDriverMemory}
                 --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=3840
                 --conf spark.network.timeout=${sparkNetworkTimeout}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputPath</arg><arg>${workingDir}/join_entities</arg>
             <arg>--outputPath</arg><arg>${workingDir}/xml_json</arg>
@@ -658,9 +740,9 @@
             <main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
-            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+            <arg>--zkHost</arg><arg>${zkHost}</arg>
             <arg>--action</arg><arg>DELETE_BY_QUERY</arg>
-            <arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
+            <arg>--shadowFormat</arg><arg>${collection}</arg>
             <arg>--query</arg><arg>${solrDeletionQuery}</arg>
             <arg>--commit</arg><arg>true</arg>
         </java>
@@ -689,15 +771,15 @@
                 --conf spark.speculation=false
                 --conf spark.hadoop.mapreduce.map.speculative=false
                 --conf spark.hadoop.mapreduce.reduce.speculative=false
-                --conf spark.executorEnv.JAVA_HOME=/srv/java/openjdk-17
-                --conf spark.yarn.appMasterEnv.JAVA_HOME=/srv/java/openjdk-17
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--path</arg><arg>${workingDir}/xml_json</arg>
-            <arg>--collection</arg><arg>${shadowFormat}-index-openaire</arg>
+            <arg>--collection</arg><arg>${collection}</arg>
             <arg>--zkHost</arg><arg>${zkHost}</arg>
             <arg>--batchSize</arg><arg>${batchSize}</arg>
         </spark>
@@ -710,8 +792,8 @@
             <main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
-            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
-            <arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
+            <arg>--zkHost</arg><arg>${zkHost}</arg>
+            <arg>--shadowFormat</arg><arg>${collection}</arg>
             <arg>--action</arg><arg>COMMIT</arg>
         </java>
@@ -731,10 +813,12 @@
                 --driver-memory=${sparkDriverMemoryForJoining}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.executorEnv.JAVA_HOME=${JAVA_HOME}
+                --conf spark.yarn.appMasterEnv.JAVA_HOME=${JAVA_HOME}
             </spark-opts>
             <arg>--inputPath</arg><arg>${workingDir}/xml_json</arg>
-            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
-            <arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
+            <arg>--zkHost</arg><arg>${zkHost}</arg>
+            <arg>--shadowFormat</arg><arg>${collection}</arg>
             <arg>--outputPath</arg><arg>${workingDir}/solr_documents</arg>
         </spark>
@@ -751,7 +835,7 @@
             <main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
-            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
+            <arg>--zkHost</arg><arg>${zkHost}</arg>
             <arg>--action</arg><arg>UPDATE_ALIASES</arg>
             <arg>--publicFormat</arg><arg>${publicFormat}</arg>
             <arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
         </java>
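[Note] The Java 17 switch has to reach three distinct JVMs, which is why the
same value is wired in three places: oozie.launcher.mapreduce.map.env sets
JAVA_HOME for the Oozie launcher map task, spark.yarn.appMasterEnv.JAVA_HOME
for the YARN application master, and spark.executorEnv.JAVA_HOME for the
executors. A quick sanity check (a sketch, not part of the workflow) is to log
the JVM from inside a job:

    // Prints which JVM the current process actually runs on; with the
    // overrides in place this should report the openjdk-17 installation.
    public class JvmCheck {
        public static void main(String[] args) {
            System.out.println("java.home=" + System.getProperty("java.home"));
            System.out.println("java.version=" + System.getProperty("java.version"));
        }
    }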
diff --git a/pom.xml b/pom.xml
index f685f92c7..61e10c278 100644
--- a/pom.xml
+++ b/pom.xml
@@ -180,7 +180,7 @@
             <dependency>
                 <groupId>eu.dnetlib.dhp</groupId>
                 <artifactId>openaire-solr-importer</artifactId>
-                <version>[1.0.2]</version>
+                <version>[1.0.4-SNAPSHOT]</version>
             </dependency>
             <dependency>
                 <groupId>org.apache.hadoop</groupId>
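[Note] The bracketed version [1.0.4-SNAPSHOT] is Maven's exact-version
requirement syntax: unlike a bare version, it forbids mediation to any other
version of openaire-solr-importer. The resolved version can be verified with
mvn dependency:tree -Dincludes=eu.dnetlib.dhp:openaire-solr-importer.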