From 6b19dcee80cdc19b6acdd95e6b05e5a4093b3ba7 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Thu, 19 Oct 2023 19:58:25 +0300 Subject: [PATCH] Add actionset creation for pubmed affiliations --- .../PrepareAffiliationRelations.java | 32 ++++++++++++------- .../input_actionset_parameter.json | 8 ++++- .../bipaffiliations/job.properties | 1 + .../bipaffiliations/oozie_app/workflow.xml | 7 +++- 4 files changed, 35 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java index 603ad6339..cbfba30c5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java @@ -12,6 +12,7 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.sql.*; import org.apache.spark.sql.Dataset; @@ -58,10 +59,13 @@ public class PrepareAffiliationRelations implements Serializable { log.info("isSparkSessionManaged: {}", isSparkSessionManaged); final String inputPath = parser.get("inputPath"); - log.info("inputPath {}: ", inputPath); + log.info("inputPath: {}", inputPath); + + final String pubmedInputPath = parser.get("pubmedInputPath"); + log.info("pubmedInputPath: {}", pubmedInputPath); final String outputPath = parser.get("outputPath"); - log.info("outputPath {}: ", outputPath); + log.info("outputPath: {}", outputPath); SparkConf conf = new SparkConf(); @@ -70,12 +74,22 @@ public class PrepareAffiliationRelations implements Serializable { isSparkSessionManaged, spark -> { Constants.removeOutputDir(spark, outputPath); - prepareAffiliationRelations(spark, inputPath, outputPath); + + List collectedFromCrossref = OafMapperUtils.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref"); + JavaPairRDD crossrefRelations = prepareAffiliationRelations(spark, inputPath, collectedFromCrossref); + + List collectedFromPubmed = OafMapperUtils.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed"); + JavaPairRDD pubmedRelations = prepareAffiliationRelations(spark, inputPath, collectedFromPubmed); + + crossrefRelations + .union(pubmedRelations) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + }); } - private static void prepareAffiliationRelations(SparkSession spark, String inputPath, - String outputPath) { + private static JavaPairRDD prepareAffiliationRelations(SparkSession spark, String inputPath, + List collectedfrom) { // load and parse affiliation relations from HDFS Dataset df = spark @@ -92,7 +106,7 @@ public class PrepareAffiliationRelations implements Serializable { new Column("matching.Confidence").as("confidence")); // prepare action sets for affiliation relations - df + return df .toJavaRDD() .flatMap((FlatMapFunction) row -> { @@ -120,8 +134,6 @@ public class PrepareAffiliationRelations implements Serializable { qualifier, Double.toString(row.getAs("confidence"))); - List collectedfrom = OafMapperUtils.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref"); - // return bi-directional relations return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator(); @@ -129,9 +141,7 @@ public class PrepareAffiliationRelations implements Serializable { .map(p -> new AtomicAction(Relation.class, p)) .mapToPair( aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), - new Text(OBJECT_MAPPER.writeValueAsString(aa)))) - .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); - + new Text(OBJECT_MAPPER.writeValueAsString(aa)))); } private static List getAffiliationRelationPair(String paperId, String affId, List collectedfrom, diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json index 7663a454b..96dcc3b32 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json @@ -8,7 +8,13 @@ { "paramName": "ip", "paramLongName": "inputPath", - "paramDescription": "the URL from where to get the programme file", + "paramDescription": "the path to get the input data from Crossref", + "paramRequired": true + }, + { + "paramName": "pip", + "paramLongName": "pubmedInputPath", + "paramDescription": "the path to get the input data from Pubmed", "paramRequired": true }, { diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties index d942e6772..fe3cbb633 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties @@ -32,4 +32,5 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen oozie.wf.application.path=${oozieTopWfApplicationPath} inputPath=/data/bip-affiliations/data.json +pubmedInputPath=/data/bip-affiiations/pubmed-data.json outputPath=/tmp/crossref-affiliations-output-v5 diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml index 9930cfe17..c0a6bfc52 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml @@ -3,7 +3,11 @@ inputPath - the path where to find the inferred affiliation relations + the path where to find the inferred affiliation relations from Crossref + + + pubmedInputPath + the path where to find the inferred affiliation relations from Pubmed outputPath @@ -97,6 +101,7 @@ --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} --inputPath${inputPath} + --pubmedInputPath${pubmedInputPath} --outputPath${outputPath}