From 5968ec018db3e9292d8be92552199483791d4ed5 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 22 Jul 2022 16:48:38 +0200 Subject: [PATCH] [Clean Country] modified workflow and added param file --- .../country/GetDatasourceFromCountry.java | 2 +- .../dhp/oa/graph/clean/oozie_app/workflow.xml | 175 +++++++++++++++++- .../graph/input_clean_country_parameters.json | 6 + .../input_datasource_country_parameters.json | 26 +++ 4 files changed, 204 insertions(+), 5 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_datasource_country_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/GetDatasourceFromCountry.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/GetDatasourceFromCountry.java index cdca961074..c255c445fb 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/GetDatasourceFromCountry.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/country/GetDatasourceFromCountry.java @@ -38,7 +38,7 @@ public class GetDatasourceFromCountry implements Serializable { .toString( GetDatasourceFromCountry.class .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json")); + "/eu/dnetlib/dhp/oa/graph/input_datasource_country_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index 0cf6cdd05e..2ba0a7ad7d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -14,8 +14,8 @@ the address of the lookUp service - shouldCleanContext - true if the context have to be cleaned + shouldClean + true if the operation of deletion of not needed values from the results has to be performed contextId @@ -30,6 +30,22 @@ It is the constrint to be verified. This time is hardcoded as gcube and it is searched for in the title. If title starts with gcube than the context sobigdata will be removed by the result if present + + verifyCountryParam + 10.17632;10.5061 + These are the constraints to be verified. This time they are hardcoded as the DOI prefixes of Mendeley and Dryad and they are searched for in + the pid value. If the pid value starts with one of the two prefixes, then the country may be removed + + + country + NL + It is the country to be removed from the set of countries if it is present with provenance propagation. The country will not be removed if in one of the instances there is a datasource with country `country` + + + collectedfrom + NARCIS + the only datasource for which the country NL will be removed from the country list + sparkDriverMemory @@ -296,7 +312,7 @@ - ${wf:conf('shouldCleanContext') eq true} + ${wf:conf('shouldClean') eq true} @@ -416,7 +432,158 @@ - + + + + + + yarn + cluster + Clean publications context + eu.dnetlib.dhp.oa.graph.clean.country.GetDatasourceFromCountry + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --inputPath${graphOutputPath} + --workingPath${workingDir}/working/hostedby 
+ --country${country} + + + + + + + + + + + + + + + yarn + cluster + Clean publications country + eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --inputPath${graphOutputPath}/publication + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication + --workingPath${workingDir}/working/publication + --country${country} + --verifyParam${verifyCountryParam} + --datasourcePath${workingDir}/working/hostedby + --collectedfrom${collectedfrom} + + + + + + + + yarn + cluster + Clean datasets Country + eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --inputPath${graphOutputPath}/dataset + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset + --workingPath${workingDir}/working/dataset + --country${country} + --verifyParam${verifyCountryParam} + --datasourcePath${workingDir}/working/hostedby + --collectedfrom${collectedfrom} + + + + + + + + yarn + cluster + Clean otherresearchproducts country + eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob + dhp-graph-mapper-${projectVersion}.jar + + 
--executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --inputPath${graphOutputPath}/otherresearchproduct + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --workingPath${workingDir}/working/otherresearchproduct + --country${country} + --verifyParam${verifyCountryParam} + --datasourcePath${workingDir}/working/hostedby + --collectedfrom${collectedfrom} + + + + + + + + yarn + cluster + Clean softwares country + eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --inputPath${graphOutputPath}/software + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software + --workingPath${workingDir}/working/software + --country${country} + --verifyParam${verifyCountryParam} + --datasourcePath${workingDir}/working/hostedby + --collectedfrom${collectedfrom} + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json index e46f7b6da2..318fb22f80 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json @@ -39,5 +39,11 @@ "paramLongName": "collectedfrom", "paramDescription": "the collectedfrom value for which we should apply the cleaning", "paramRequired": true + }, + { + "paramName": "hb", + "paramLongName": "hostedBy", + "paramDescription": "the set of datasources having the specified country in the graph searched for in the hostedby of the results", + "paramRequired": true } ] diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_datasource_country_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_datasource_country_parameters.json new file mode 100644 index 0000000000..e0aa60328d --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_datasource_country_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "in", + "paramLongName": "inputPath", + "paramDescription": "the path to the graph data dump to read", + "paramRequired": true + }, + { + "paramName": "wp", + "paramLongName": "workingPath", + "paramDescription": "the path to store the output graph", + "paramRequired": true + }, + { + "paramName": "c", + "paramLongName": "country", + "paramDescription": "the country whose datasources have to be selected", + "paramRequired": true + } +]