From 3afd4aa57bb107e35f71108c64c45ada698cf8a7 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 22 Dec 2023 11:27:30 +0100 Subject: [PATCH] adjustments for country propagation --- .../PrepareDatasourceCountryAssociation.java | 5 ++- .../PrepareResultCountrySet.java | 2 +- .../SparkCountryPropagationJob.java | 2 +- .../PrepareInfo.java | 2 +- .../input_countrypropagation_parameters.json | 32 ++++++++++++++++ .../input_prepareassoc_parameters.json | 32 ++++++++++++++++ ...input_prepareresultcountry_parameters.json | 38 +++++++++++++++++++ .../countrypropagation/oozie_app/workflow.xml | 35 ++++++++++------- 8 files changed, 130 insertions(+), 18 deletions(-) create mode 100644 dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_countrypropagation_parameters.json create mode 100644 dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareassoc_parameters.json create mode 100644 dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareresultcountry_parameters.json diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java index 430c265924..a016509e57 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java @@ -45,7 +45,7 @@ public class PrepareDatasourceCountryAssociation { .toString( PrepareDatasourceCountryAssociation.class .getResourceAsStream( - "/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareassoc_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -90,7 +90,8 @@ public class PrepareDatasourceCountryAssociation { (FilterFunction) ds -> !ds.getDataInfo().getDeletedbyinference() && Optional.ofNullable(ds.getDatasourcetype()).isPresent() && Optional.ofNullable(ds.getDatasourcetype().getClassid()).isPresent() && - (allowedtypes.contains(ds.getJurisdiction().getClassid()) || + ((Optional.ofNullable(ds.getJurisdiction()).isPresent() && + allowedtypes.contains(ds.getJurisdiction().getClassid())) || whitelist.contains(ds.getId()))); // filtering of the relations taking the non deleted by inference and those with IsProvidedBy as relclass diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java index 184d24751b..884aa0e47e 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java @@ -32,7 +32,7 @@ public class PrepareResultCountrySet { .toString( PrepareResultCountrySet.class .getResourceAsStream( - "/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareresultcountry_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java index 17247f8125..92930c18bd 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java @@ -35,7 +35,7 @@ public class SparkCountryPropagationJob { .toString( SparkCountryPropagationJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_countrypropagation_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java index 8d3432f062..bdfdde13bd 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java @@ -60,7 +60,7 @@ public class PrepareInfo implements Serializable { .toString( SparkResultToOrganizationFromIstRepoJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json")); + "/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/input_preparation_parameter.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_countrypropagation_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_countrypropagation_parameters.json new file mode 100644 index 0000000000..d3cde8b747 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_countrypropagation_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "p", + "paramLongName": "preparedInfoPath", + "paramDescription": "the path where prepared info have been stored", + "paramRequired": false + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + } +] diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareassoc_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareassoc_parameters.json new file mode 100644 index 0000000000..a00105f2ba --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareassoc_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "whitelist", + "paramDescription": "the datasource having a type different from the allowed ones but that we want to add anyway", + "paramRequired": true + }, + { + "paramName": "at", + "paramLongName": "allowedtypes", + "paramDescription": "the allowed datasource types for country propagation", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareresultcountry_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareresultcountry_parameters.json new file mode 100644 index 0000000000..18163d1f96 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareresultcountry_parameters.json @@ -0,0 +1,38 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"out", + "paramLongName":"outputPath", + "paramDescription": "the output path", + "paramRequired": true + }, + { + "paramName":"w", + "paramLongName":"workingPath", + "paramDescription": "the working path", + "paramRequired": true + }, + { + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + }, + { + "paramName": "p", + "paramLongName": "preparedInfoPath", + "paramDescription": "the path where prepared info have been stored", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app/workflow.xml index 933bab7e06..81d6dc3dc1 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app/workflow.xml @@ -61,7 +61,7 @@ --sourcePath${sourcePath} --whitelist${whitelist} --allowedtypes${allowedtypes} - --workingPath${workingDir}/country + --outputPath${workingDir}/preparedInfo @@ -95,8 +95,10 @@ --conf spark.sql.shuffle.partitions=3840 --sourcePath${sourcePath}/publication - --workingPath${workingDir}/country + --outputPath${workingDir}/publication + --workingPath${workingDir}/workingP --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --preparedInfoPath${workingDir}/preparedInfo @@ -123,8 +125,10 @@ --conf spark.sql.shuffle.partitions=3840 --sourcePath${sourcePath}/dataset - --workingPath${workingDir}/country + --outputPath${workingDir}/dataset + --workingPath${workingDir}/workingD --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --preparedInfoPath${workingDir}/preparedInfo @@ -151,8 +155,10 @@ --conf spark.sql.shuffle.partitions=3840 --sourcePath${sourcePath}/otherresearchproduct - --workingPath${workingDir}/country + --outputPath${workingDir}/otherresearchproduct + --workingPath${workingDir}/workingO --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --preparedInfoPath${workingDir}/preparedInfo @@ -179,14 +185,16 @@ --conf spark.sql.shuffle.partitions=3840 --sourcePath${sourcePath}/software - --workingPath${workingDir}/country + --outputPath${workingDir}/software + --workingPath${workingDir}/workingS --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --preparedInfoPath${workingDir}/preparedInfo - + @@ -216,9 +224,9 @@ --conf spark.sql.shuffle.partitions=3840 --sourcePath${sourcePath}/publication - --workingPath${workingDir}/country + --preparedInfoPath${workingDir}/publication --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - + --outputPath${workingDir}/country/publication @@ -245,9 +253,9 @@ --conf spark.sql.shuffle.partitions=3840 --sourcePath${sourcePath}/dataset - --workingPath${workingDir}/country + --preparedInfoPath${workingDir}/dataset --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - + --outputPath${workingDir}/country/dataset @@ -274,9 +282,9 @@ --conf spark.sql.shuffle.partitions=3840 --sourcePath${sourcePath}/otherresearchproduct - --workingPath${workingDir}/country + --preparedInfoPath${workingDir}/otherresearchproduct --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - + --outputPath${workingDir}/country/otherresearchproduct @@ -303,8 +311,9 @@ --conf spark.sql.shuffle.partitions=3840 --sourcePath${sourcePath}/software - --workingPath${workingDir}/country + --preparedInfoPath${workingDir}/software --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${workingDir}/country/software