From 6ce9b600c1ec3a68328f0f814f895aee4edd421c Mon Sep 17 00:00:00 2001
From: Serafeim Chatzopoulos <s.chatzopoulos@gmail.com>
Date: Thu, 19 Oct 2023 19:58:25 +0300
Subject: [PATCH] Add actionset creation for pubmed affiliations

---
 .../PrepareAffiliationRelations.java          | 32 ++++++++++++-------
 .../input_actionset_parameter.json            |  8 ++++-
 .../bipaffiliations/job.properties            |  1 +
 .../bipaffiliations/oozie_app/workflow.xml    |  7 +++-
 4 files changed, 35 insertions(+), 13 deletions(-)
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
index 603ad6339..cbfba30c5 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
@@ -12,6 +12,7 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.Dataset;
@@ -58,10 +59,13 @@ public class PrepareAffiliationRelations implements Serializable {
 		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 
 		final String inputPath = parser.get("inputPath");
-		log.info("inputPath {}: ", inputPath);
+		log.info("inputPath: {}", inputPath);
+
+		final String pubmedInputPath = parser.get("pubmedInputPath");
+		log.info("pubmedInputPath: {}", pubmedInputPath);
 
 		final String outputPath = parser.get("outputPath");
-		log.info("outputPath {}: ", outputPath);
+		log.info("outputPath: {}", outputPath);
 
 		SparkConf conf = new SparkConf();
 
@@ -70,12 +74,22 @@ public class PrepareAffiliationRelations implements Serializable {
 			isSparkSessionManaged,
 			spark -> {
 				Constants.removeOutputDir(spark, outputPath);
-				prepareAffiliationRelations(spark, inputPath, outputPath);
+				// Crossref relations: read from inputPath, with Crossref passed as collectedFrom
+				List<KeyValue> collectedFromCrossref = OafMapperUtils.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
+				JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(spark, inputPath, collectedFromCrossref);
+				// Pubmed relations: must be read from pubmedInputPath, not from the Crossref inputPath
+				List<KeyValue> collectedFromPubmed = OafMapperUtils.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
+				JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(spark, pubmedInputPath, collectedFromPubmed);
+
+				crossrefRelations
+						.union(pubmedRelations)
+							.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+
 			});
 	}
 
-	private static <I extends Result> void prepareAffiliationRelations(SparkSession spark, String inputPath,
-		String outputPath) {
+	private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark, String inputPath,
+																						  List<KeyValue> collectedfrom) {
 
 		// load and parse affiliation relations from HDFS
 		Dataset<Row> df = spark
@@ -92,7 +106,7 @@ public class PrepareAffiliationRelations implements Serializable {
 				new Column("matching.Confidence").as("confidence"));
 
 		// prepare action sets for affiliation relations
-		df
+		return df
 			.toJavaRDD()
 			.flatMap((FlatMapFunction<Row, Relation>) row -> {
 
@@ -120,8 +134,6 @@ public class PrepareAffiliationRelations implements Serializable {
 						qualifier,
 						Double.toString(row.getAs("confidence")));
 
-				List<KeyValue> collectedfrom = OafMapperUtils.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
-
 				// return bi-directional relations
 				return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator();
 
@@ -129,9 +141,7 @@ public class PrepareAffiliationRelations implements Serializable {
 			.map(p -> new AtomicAction(Relation.class, p))
 			.mapToPair(
 				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
-					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
-			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
-
+					new Text(OBJECT_MAPPER.writeValueAsString(aa))));
 	}
 
 	private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json
index 7663a454b..96dcc3b32 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json
@@ -8,7 +8,13 @@
   {
     "paramName": "ip",
     "paramLongName": "inputPath",
-    "paramDescription": "the URL from where to get the programme file",
+    "paramDescription": "the path to get the input data from Crossref",
+    "paramRequired": true
+  },
+  {
+    "paramName": "pip",
+    "paramLongName": "pubmedInputPath",
+    "paramDescription": "the path to get the input data from Pubmed",
     "paramRequired": true
   },
   {
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties
index d942e6772..fe3cbb633 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties
@@ -32,4 +32,5 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen
 oozie.wf.application.path=${oozieTopWfApplicationPath}
 
 inputPath=/data/bip-affiliations/data.json
+pubmedInputPath=/data/bip-affiliations/pubmed-data.json
 outputPath=/tmp/crossref-affiliations-output-v5
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml
index 9930cfe17..c0a6bfc52 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml
@@ -3,7 +3,11 @@
 
         <property>
             <name>inputPath</name>
-            <description>the path where to find the inferred affiliation relations</description>
+            <description>the path where to find the inferred affiliation relations from Crossref</description>
+        </property>
+        <property>
+            <name>pubmedInputPath</name>
+            <description>the path where to find the inferred affiliation relations from Pubmed</description>
         </property>
         <property>
             <name>outputPath</name>
@@ -97,6 +101,7 @@
                 --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
             </spark-opts>
             <arg>--inputPath</arg><arg>${inputPath}</arg>
+            <arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
             <arg>--outputPath</arg><arg>${outputPath}</arg>
         </spark>
         <ok to="End"/>