From 0eb0701b26953d30729ee0aa6bb817da48a2da9f Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 1 Apr 2024 17:23:26 +0300 Subject: [PATCH 1/2] Add action set creation for Datacite affiliations --- .../bipaffiliations/PrepareAffiliationRelations.java | 9 +++++++++ .../bipaffiliations/input_actionset_parameter.json | 6 ++++++ .../dhp/actionmanager/bipaffiliations/job.properties | 2 ++ .../actionmanager/bipaffiliations/oozie_app/workflow.xml | 6 ++++++ .../bipaffiliations/PrepareAffiliationRelationsTest.java | 7 ++++--- 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java index f9a5c539f..2b50fcfba 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java @@ -67,6 +67,9 @@ public class PrepareAffiliationRelations implements Serializable { final String openapcInputPath = parser.get("openapcInputPath"); log.info("openapcInputPath: {}", openapcInputPath); + final String dataciteInputPath = parser.get("dataciteInputPath"); + log.info("dataciteInputPath: {}", dataciteInputPath); + final String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); @@ -93,9 +96,15 @@ public class PrepareAffiliationRelations implements Serializable { JavaPairRDD openAPCRelations = prepareAffiliationRelations( spark, openapcInputPath, collectedFromOpenAPC); + List collectedFromDatacite = OafMapperUtils + .listKeyValues(ModelConstants.DATACITE_ID, "Datacite"); + JavaPairRDD dataciteRelations = prepareAffiliationRelations( + spark, dataciteInputPath, collectedFromDatacite); + crossrefRelations .union(pubmedRelations) .union(openAPCRelations) + .union(dataciteRelations) .saveAsHadoopFile( outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json index 5a91ace75..9671129f7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json @@ -23,6 +23,12 @@ "paramDescription": "the path to get the input data from OpenAPC", "paramRequired": true }, + { + "paramName": "dip", + "paramLongName": "dataciteInputPath", + "paramDescription": "the path to get the input data from Datacite", + "paramRequired": true + }, { "paramName": "o", "paramLongName": "outputPath", diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties index 69c667732..fe663b5e9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties @@ -34,4 +34,6 @@ oozie.wf.application.path=${oozieTopWfApplicationPath} crossrefInputPath=/data/bip-affiliations/crossref-data.json pubmedInputPath=/data/bip-affiliations/pubmed-data.json openapcInputPath=/data/bip-affiliations/openapc-data.json +dataciteInputPath=/data/bip-affiliations/openapc-data.json + outputPath=/tmp/crossref-affiliations-output-v5 diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml index e42437396..e8e6a7c33 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml @@ -13,6 +13,10 @@ openapcInputPath the path where to find the inferred affiliation relations from OpenAPC + + dataciteInputPath + the path where to find the inferred affiliation relations from Datacite + outputPath the path where to store the actionset @@ -107,6 +111,8 @@ --crossrefInputPath${crossrefInputPath} --pubmedInputPath${pubmedInputPath} --openapcInputPath${openapcInputPath} + --dataciteInputPath${dataciteInputPath} + --outputPath${outputPath} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java index c164aad8b..bceb9d754 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java @@ -87,6 +87,7 @@ public class PrepareAffiliationRelationsTest { "-crossrefInputPath", crossrefAffiliationRelationPath, "-pubmedInputPath", crossrefAffiliationRelationPath, "-openapcInputPath", crossrefAffiliationRelationPath, + "-dataciteInputPath", crossrefAffiliationRelationPath, "-outputPath", outputPath }); @@ -103,7 +104,7 @@ public class PrepareAffiliationRelationsTest { // ); // } // count the number of relations - assertEquals(60, tmp.count()); + assertEquals(80, tmp.count()); Dataset dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); dataset.createOrReplaceTempView("result"); @@ -114,7 +115,7 @@ public class PrepareAffiliationRelationsTest { // verify that we have equal number of bi-directional relations Assertions .assertEquals( - 30, execVerification + 40, execVerification .filter( "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") .collectAsList() @@ -122,7 +123,7 @@ public class PrepareAffiliationRelationsTest { Assertions .assertEquals( - 30, execVerification + 40, execVerification .filter( "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") .collectAsList() From cbe13a5c6158af9d15dcdd7c8d02847b41775156 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 2 Apr 2024 18:00:35 +0300 Subject: [PATCH 2/2] Fix datacite input path in properties file --- .../eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties index fe663b5e9..ded4fe409 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties @@ -34,6 +34,6 @@ oozie.wf.application.path=${oozieTopWfApplicationPath} crossrefInputPath=/data/bip-affiliations/crossref-data.json pubmedInputPath=/data/bip-affiliations/pubmed-data.json openapcInputPath=/data/bip-affiliations/openapc-data.json -dataciteInputPath=/data/bip-affiliations/openapc-data.json +dataciteInputPath=/data/bip-affiliations/datacite-data.json outputPath=/tmp/crossref-affiliations-output-v5