From 5e05385d35d54313ae337da6f1454889010feb39 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 5 Jun 2024 16:31:58 +0200 Subject: [PATCH 1/4] minor --- .../java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java index 57f180fa0..3d69370af 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/oaipmh/IrishOaiExporterJob.java @@ -31,7 +31,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.provision.XmlConverterJob; import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument; import eu.dnetlib.dhp.oa.provision.model.TupleWrapper; @@ -48,7 +47,7 @@ public class IrishOaiExporterJob { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - XmlConverterJob.class + IrishOaiExporterJob.class .getResourceAsStream("/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json"))); parser.parseArgument(args); From efc1632e16e61322edd7d89c3924056ef126dc2a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 6 Jun 2024 09:25:26 +0200 Subject: [PATCH 2/4] code formatting --- .../oa/provision/utils/XmlSerializationUtils.java | 14 +++++++------- .../dhp/oa/provision/XmlRecordFactoryTest.java | 1 - 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java index 0b8093edd..398e81e8c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java @@ -176,13 +176,13 @@ public class XmlSerializationUtils { StringBuilder sb = new StringBuilder(); for (KeyValue kv : measure.getUnit()) { sb - .append("<") - .append(name) - .append(" ") - .append(attr("id", measure.getId())) - .append(attr("count", kv.getValue())) - .append(attr("datasource", kv.getKey())) - .append(" />"); + .append("<") + .append(name) + .append(" ") + .append(attr("id", measure.getId())) + .append(attr("count", kv.getValue())) + .append(attr("datasource", kv.getKey())) + .append(" />"); } return sb.toString(); } diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java index 42a62b5d5..055156a32 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java @@ -43,7 +43,6 @@ public class XmlRecordFactoryTest { final Publication p = OBJECT_MAPPER .readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class); - final String xml = xmlRecordFactory.build(new JoinedEntity(p)); assertNotNull(xml); From 576efc1857fb1c96bd9979efa1cf6c8450a8f6bc Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 10 Jun 2024 15:22:33 +0200 Subject: [PATCH 3/4] hostedby patching to work with the updated Crossref contents --- .../hostedbymap/SparkApplyHostedByMapToResult.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala index 803f95fc1..db7edf53e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/oa/graph/hostedbymap/SparkApplyHostedByMapToResult.scala @@ -27,10 +27,12 @@ object SparkApplyHostedByMapToResult { val inst: Instance = i.head patchInstance(p, ei, inst) - } else if (i.size == 2) { - if (i.map(ii => ii.getCollectedfrom.getValue).contains("UnpayWall")) { - val inst: Instance = i.filter(ii => "Crossref".equals(ii.getCollectedfrom.getValue)).head - patchInstance(p, ei, inst) + } else { + val cf = i.map(ii => ii.getCollectedfrom.getValue) + if (cf.contains("Crossref")) { + i.foreach(ii => { + patchInstance(p, ei, ii) + }) } } } From d39a1054b8e7fbb0469f341509faf67d15f94899 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 10 Jun 2024 16:15:07 +0200 Subject: [PATCH 4/4] [actionset promotion] use sparkExecutorMemory to define also the memoryOverhead --- dhp-shade-package/dependency-reduced-pom.xml | 113 ++++++++++++++++++ .../wf/dataset/oozie_app/workflow.xml | 2 + .../wf/datasource/oozie_app/workflow.xml | 1 + .../wf/main/oozie_app/workflow.xml | 1 + .../wf/organization/oozie_app/workflow.xml | 1 + .../oozie_app/workflow.xml | 4 +- .../wf/project/oozie_app/workflow.xml | 1 + .../wf/publication/oozie_app/workflow.xml | 6 +- .../wf/relation/oozie_app/workflow.xml | 3 +- .../wf/software/oozie_app/workflow.xml | 4 +- 10 files changed, 131 insertions(+), 5 deletions(-) create mode 100644 dhp-shade-package/dependency-reduced-pom.xml diff --git a/dhp-shade-package/dependency-reduced-pom.xml b/dhp-shade-package/dependency-reduced-pom.xml new file mode 100644 index 000000000..04843072f --- /dev/null +++ b/dhp-shade-package/dependency-reduced-pom.xml @@ -0,0 +1,113 @@ + + + + dhp + eu.dnetlib.dhp + 1.2.5-SNAPSHOT + + 4.0.0 + dhp-shade-package + This module create a jar of all module dependencies + + + + maven-shade-plugin + + + package + + shade + + + + + eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels + + + + META-INF/cxf/bus-extensions.txt + + + + + *:* + + META-INF/maven/** + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + com + repackaged.com.google.common + + com.google.common.** + + + + + + + + + + + + org.projectlombok + lombok + 1.18.28 + provided + + + org.junit.jupiter + junit-jupiter + 5.6.1 + test + + + junit-jupiter-api + org.junit.jupiter + + + junit-jupiter-params + org.junit.jupiter + + + junit-jupiter-engine + org.junit.jupiter + + + + + org.mockito + mockito-core + 3.3.3 + test + + + byte-buddy + net.bytebuddy + + + byte-buddy-agent + net.bytebuddy + + + + + org.mockito + mockito-junit-jupiter + 3.3.3 + test + + + + + DHPSite + ${dhp.site.stage.path}/dhp-common + + + diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml index 5401b45ca..b1bc1d6e1 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml @@ -103,6 +103,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -156,6 +157,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml index f9bd66ae3..9a84f4708 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/datasource/oozie_app/workflow.xml @@ -95,6 +95,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml index 393f04e89..65ddd402b 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml @@ -125,6 +125,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml index ebfdeee31..0d73b498d 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/organization/oozie_app/workflow.xml @@ -95,6 +95,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml index 02399ed9b..ca8362c9b 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml @@ -103,6 +103,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -155,11 +156,12 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=2560 + --conf spark.sql.shuffle.partitions=8000 --inputGraphTablePath${workingDir}/otherresearchproduct --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml index 57c2357b4..37310da79 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/project/oozie_app/workflow.xml @@ -95,6 +95,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml index 92b114776..a4b0b237c 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml @@ -103,11 +103,12 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7000 + --conf spark.sql.shuffle.partitions=15000 --inputGraphTablePath${inputGraphRootPath}/publication --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication @@ -156,11 +157,12 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7000 + --conf spark.sql.shuffle.partitions=15000 --inputGraphTablePath${workingDir}/publication --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml index e9e5f0b45..f72847ba8 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml @@ -95,11 +95,12 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=10000 + --conf spark.sql.shuffle.partitions=15000 --inputGraphTablePath${inputGraphRootPath}/relation --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Relation diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml index 1d36ddf94..dbe8a63c1 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml @@ -103,6 +103,7 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -155,11 +156,12 @@ --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=2560 + --conf spark.sql.shuffle.partitions=4000 --inputGraphTablePath${workingDir}/software --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software