From f0dc12634bac955c87d327b7666440a5b41ad635 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Sun, 18 Feb 2024 18:02:09 +0200 Subject: [PATCH] Add Action Set creation for affiliations inferred from the OpenAPC data --- .../PrepareAffiliationRelations.java | 9 +++++++++ .../bipaffiliations/input_actionset_parameter.json | 6 ++++++ .../actionmanager/bipaffiliations/job.properties | 3 ++- .../bipaffiliations/oozie_app/workflow.xml | 5 +++++ .../PrepareAffiliationRelationsTest.java | 13 +++++-------- 5 files changed, 27 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java index 1bdb06ecc..f9a5c539f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java @@ -64,6 +64,9 @@ public class PrepareAffiliationRelations implements Serializable { final String pubmedInputPath = parser.get("pubmedInputPath"); log.info("pubmedInputPath: {}", pubmedInputPath); + final String openapcInputPath = parser.get("openapcInputPath"); + log.info("openapcInputPath: {}", openapcInputPath); + final String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); @@ -85,8 +88,14 @@ public class PrepareAffiliationRelations implements Serializable { JavaPairRDD pubmedRelations = prepareAffiliationRelations( spark, pubmedInputPath, collectedFromPubmed); + List collectedFromOpenAPC = OafMapperUtils + .listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC"); + JavaPairRDD openAPCRelations = prepareAffiliationRelations( + spark, openapcInputPath, collectedFromOpenAPC); + crossrefRelations .union(pubmedRelations) + .union(openAPCRelations) .saveAsHadoopFile( outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json index c6f905199..5a91ace75 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json @@ -17,6 +17,12 @@ "paramDescription": "the path to get the input data from Pubmed", "paramRequired": true }, + { + "paramName": "oip", + "paramLongName": "openapcInputPath", + "paramDescription": "the path to get the input data from OpenAPC", + "paramRequired": true + }, { "paramName": "o", "paramLongName": "outputPath", diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties index a3d55ff0c..69c667732 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties @@ -31,6 +31,7 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen # The following is needed as a property of a workflow oozie.wf.application.path=${oozieTopWfApplicationPath} -crossrefInputPath=/data/bip-affiliations/data.json +crossrefInputPath=/data/bip-affiliations/crossref-data.json pubmedInputPath=/data/bip-affiliations/pubmed-data.json +openapcInputPath=/data/bip-affiliations/openapc-data.json outputPath=/tmp/crossref-affiliations-output-v5 diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml index c5ac6f884..e42437396 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml @@ -9,6 +9,10 @@ pubmedInputPath the path where to find the inferred affiliation relations from Pubmed + + openapcInputPath + the path where to find the inferred affiliation relations from OpenAPC + outputPath the path where to store the actionset @@ -102,6 +106,7 @@ --crossrefInputPath${crossrefInputPath} --pubmedInputPath${pubmedInputPath} + --openapcInputPath${openapcInputPath} --outputPath${outputPath} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java index b87738879..c164aad8b 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java @@ -78,10 +78,6 @@ public class PrepareAffiliationRelationsTest { .getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json") .getPath(); - String pubmedAffiliationRelationsPath = getClass() - .getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json") - .getPath(); - String outputPath = workingDir.toString() + "/actionSet"; PrepareAffiliationRelations @@ -89,7 +85,8 @@ public class PrepareAffiliationRelationsTest { new String[] { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-crossrefInputPath", crossrefAffiliationRelationPath, - "-pubmedInputPath", pubmedAffiliationRelationsPath, + "-pubmedInputPath", crossrefAffiliationRelationPath, + "-openapcInputPath", crossrefAffiliationRelationPath, "-outputPath", outputPath }); @@ -106,7 +103,7 @@ public class PrepareAffiliationRelationsTest { // ); // } // count the number of relations - assertEquals(40, tmp.count()); + assertEquals(60, tmp.count()); Dataset dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); dataset.createOrReplaceTempView("result"); @@ -117,7 +114,7 @@ public class PrepareAffiliationRelationsTest { // verify that we have equal number of bi-directional relations Assertions .assertEquals( - 20, execVerification + 30, execVerification .filter( "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") .collectAsList() @@ -125,7 +122,7 @@ public class PrepareAffiliationRelationsTest { Assertions .assertEquals( - 20, execVerification + 30, execVerification .filter( "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") .collectAsList()