adjustments for country propagation

Miriam Baglioni 2023-12-22 11:27:30 +01:00
parent 4740c808f7
commit 3afd4aa57b
8 changed files with 130 additions and 18 deletions

View File

@@ -45,7 +45,7 @@ public class PrepareDatasourceCountryAssociation {
.toString(
PrepareDatasourceCountryAssociation.class
.getResourceAsStream(
- "/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json"));
+ "/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareassoc_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
@@ -90,7 +90,8 @@ public class PrepareDatasourceCountryAssociation {
(FilterFunction<Datasource>) ds -> !ds.getDataInfo().getDeletedbyinference() &&
Optional.ofNullable(ds.getDatasourcetype()).isPresent() &&
Optional.ofNullable(ds.getDatasourcetype().getClassid()).isPresent() &&
- (allowedtypes.contains(ds.getJurisdiction().getClassid()) ||
+ ((Optional.ofNullable(ds.getJurisdiction()).isPresent() &&
+ allowedtypes.contains(ds.getJurisdiction().getClassid())) ||
whitelist.contains(ds.getId())));
// filter the relations, keeping only those not deleted by inference and with IsProvidedBy as relclass
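
The added Optional guard fixes a NullPointerException for datasources whose jurisdiction qualifier is unset. A minimal sketch of the same null-safe pattern, with hypothetical simplified stand-ins for the Oaf model beans:

import java.util.List;
import java.util.Optional;

// Hypothetical simplified stand-ins for the Oaf Datasource and Qualifier beans.
record Qualifier(String classid) {
	String getClassid() { return classid; }
}
record Datasource(String id, Qualifier jurisdiction) {
	String getId() { return id; }
	Qualifier getJurisdiction() { return jurisdiction; }
}

class JurisdictionFilterSketch {
	static boolean keep(Datasource ds, List<String> allowedtypes, List<String> whitelist) {
		// Optional.ofNullable guards the jurisdiction, which the previous
		// version dereferenced unconditionally.
		return Optional
			.ofNullable(ds.getJurisdiction())
			.map(q -> allowedtypes.contains(q.getClassid()))
			.orElse(false)
			|| whitelist.contains(ds.getId());
	}
}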

View File

@@ -32,7 +32,7 @@ public class PrepareResultCountrySet {
.toString(
PrepareResultCountrySet.class
.getResourceAsStream(
- "/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json"));
+ "/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareresultcountry_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

View File

@@ -35,7 +35,7 @@ public class SparkCountryPropagationJob {
.toString(
SparkCountryPropagationJob.class
.getResourceAsStream(
- "/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json"));
+ "/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_countrypropagation_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

View File

@@ -60,7 +60,7 @@ public class PrepareInfo implements Serializable {
.toString(
SparkResultToOrganizationFromIstRepoJob.class
.getResourceAsStream(
- "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json"));
+ "/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/input_preparation_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

View File

@@ -0,0 +1,32 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName":"tn",
"paramLongName":"resultTableName",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "p",
"paramLongName": "preparedInfoPath",
"paramDescription": "the path where prepared info have been stored",
"paramRequired": false
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}
]
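
These definitions (presumably the relocated input_countrypropagation_parameters.json) are consumed by ArgumentApplicationParser, as in the hunks above. A minimal sketch of how the job resolves them, assuming the parser exposes values by paramLongName and following the usual Optional pattern for isSparkSessionManaged:

parser.parseArgument(args);

final String sourcePath = parser.get("sourcePath");             // required
final String resultTableName = parser.get("resultTableName");   // required
final String outputPath = parser.get("outputPath");             // required
final String preparedInfoPath = parser.get("preparedInfoPath"); // optional
// optional flag, defaulting to a managed Spark session when absent
final Boolean isSparkSessionManaged = Optional
	.ofNullable(parser.get("isSparkSessionManaged"))
	.map(Boolean::valueOf)
	.orElse(Boolean.TRUE);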

View File

@@ -0,0 +1,32 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "whitelist",
"paramDescription": "the datasource having a type different from the allowed ones but that we want to add anyway",
"paramRequired": true
},
{
"paramName": "at",
"paramLongName": "allowedtypes",
"paramDescription": "the allowed datasource types for country propagation",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}
]
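
The whitelist and allowedtypes values arrive as single strings. A short sketch of how they could be expanded into lists, assuming the semicolon-separated convention used elsewhere in this module:

final List<String> whitelist = Arrays.asList(parser.get("whitelist").split(";"));
final List<String> allowedtypes = Arrays.asList(parser.get("allowedtypes").split(";"));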

View File

@@ -0,0 +1,38 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName":"out",
"paramLongName":"outputPath",
"paramDescription": "the output path",
"paramRequired": true
},
{
"paramName":"w",
"paramLongName":"workingPath",
"paramDescription": "the working path",
"paramRequired": true
},
{
"paramName":"tn",
"paramLongName":"resultTableName",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
},
{
"paramName": "p",
"paramLongName": "preparedInfoPath",
"paramDescription": "the path where prepared info have been stored",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}
]

View File

@@ -61,7 +61,7 @@
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--whitelist</arg><arg>${whitelist}</arg>
<arg>--allowedtypes</arg><arg>${allowedtypes}</arg>
- <arg>--workingPath</arg><arg>${workingDir}/country</arg>
+ <arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="fork_prepare_result_country"/>
<error to="Kill"/>
@@ -95,8 +95,10 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
- <arg>--workingPath</arg><arg>${workingDir}/country</arg>
+ <arg>--outputPath</arg><arg>${workingDir}/publication</arg>
+ <arg>--workingPath</arg><arg>${workingDir}/workingP</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
+ <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="wait_prepare"/>
<error to="Kill"/>
@@ -123,8 +125,10 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
- <arg>--workingPath</arg><arg>${workingDir}/country</arg>
+ <arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
+ <arg>--workingPath</arg><arg>${workingDir}/workingD</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
+ <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="wait_prepare"/>
<error to="Kill"/>
@@ -151,8 +155,10 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
- <arg>--workingPath</arg><arg>${workingDir}/country</arg>
+ <arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
+ <arg>--workingPath</arg><arg>${workingDir}/workingO</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
+ <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="wait_prepare"/>
<error to="Kill"/>
@@ -179,14 +185,16 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
- <arg>--workingPath</arg><arg>${workingDir}/country</arg>
+ <arg>--outputPath</arg><arg>${workingDir}/software</arg>
+ <arg>--workingPath</arg><arg>${workingDir}/workingS</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
+ <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="wait_prepare"/>
<error to="Kill"/>
</action>
<join name="wait_prepare" to="fork_join_apply_country_propagation"/>
<join name="wait_prepare" to="fork_apply_country_propagation"/>
<fork name="fork_apply_country_propagation">
<path start="propagation_publication"/>
@@ -216,9 +224,9 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
- <arg>--workingPath</arg><arg>${workingDir}/country</arg>
+ <arg>--preparedInfoPath</arg><arg>${workingDir}/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/country/publication</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
@@ -245,9 +253,9 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
- <arg>--workingPath</arg><arg>${workingDir}/country</arg>
+ <arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/country/dataset</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
@@ -274,9 +282,9 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
- <arg>--workingPath</arg><arg>${workingDir}/country</arg>
+ <arg>--preparedInfoPath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/country/otherresearchproduct</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
@@ -303,8 +311,9 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
- <arg>--workingPath</arg><arg>${workingDir}/country</arg>
+ <arg>--preparedInfoPath</arg><arg>${workingDir}/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
+ <arg>--outputPath</arg><arg>${workingDir}/country/software</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>