Merge pull request 'Extract Information from Transformative Agreement' (#371) from transformativeagreement into beta

Reviewed-on: #371
2024-03-25 15:42:36 +01:00 · 2024-03-25 15:42:36 +01:00 · e0c315b07b
parent e1149eb5c4 54936b7f42
commit e0c315b07b
15 changed files with 97289 additions and 10 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
@ -64,6 +64,9 @@ public class PrepareAffiliationRelations implements Serializable {
 		final String pubmedInputPath = parser.get("pubmedInputPath");
 		log.info("pubmedInputPath: {}", pubmedInputPath);

+		final String openapcInputPath = parser.get("openapcInputPath");
+		log.info("openapcInputPath: {}", openapcInputPath);
+
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);

@ -85,8 +88,14 @@ public class PrepareAffiliationRelations implements Serializable {
 				JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
 					spark, pubmedInputPath, collectedFromPubmed);

+				List<KeyValue> collectedFromOpenAPC = OafMapperUtils
+					.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
+				JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
+					spark, openapcInputPath, collectedFromOpenAPC);
+
 				crossrefRelations
 					.union(pubmedRelations)
+					.union(openAPCRelations)
 					.saveAsHadoopFile(
 						outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);

--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/transformativeagreement/CreateActionSetSparkJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/transformativeagreement/CreateActionSetSparkJob.java
@ -0,0 +1,195 @@
+
+package eu.dnetlib.dhp.actionmanager.transformativeagreement;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.*;
+
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.actionmanager.transformativeagreement.model.TransformativeAgreementModel;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.action.AtomicAction;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.Country;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.utils.*;
+import scala.Tuple2;
+
+public class CreateActionSetSparkJob implements Serializable {
+
+	private static final Logger log = LoggerFactory.getLogger(CreateActionSetSparkJob.class);
+
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+	private static final String IREL_PROJECT = "40|100018998___::1e5e62235d094afd01cd56e65112fc63";
+	private static final String TRANSFORMATIVE_AGREEMENT = "openapc::transformativeagreement";
+
+	/**
+	 * Entry point of the Spark job that produces the action set for the Transformative Agreements.
+	 * <p>
+	 * The supported arguments are described by the {@code as_parameters.json} resource; when
+	 * {@code isSparkSessionManaged} is not supplied it is treated as {@code true}.
+	 *
+	 * @param args the command line arguments
+	 * @throws IOException if the parameter description cannot be read
+	 * @throws ParseException if the command line arguments cannot be parsed
+	 */
+	public static void main(final String[] args) throws IOException, ParseException {
+
+		final String jsonConfiguration = IOUtils
+			.toString(
+				Objects
+					.requireNonNull(
+						CreateActionSetSparkJob.class
+							.getResourceAsStream(
+								"/eu/dnetlib/dhp/actionmanager/transformativeagreement/as_parameters.json")));
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+		parser.parseArgument(args);
+
+		// a missing flag means that the session is managed by this job
+		final String managedFlag = parser.get("isSparkSessionManaged");
+		final Boolean isSparkSessionManaged = managedFlag == null ? Boolean.TRUE : Boolean.valueOf(managedFlag);
+		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+		final String inputPath = parser.get("inputPath");
+		log.info("inputPath {}", inputPath);
+
+		final String outputPath = parser.get("outputPath");
+		log.info("outputPath {}", outputPath);
+
+		runWithSparkSession(
+			new SparkConf(),
+			isSparkSessionManaged,
+			spark -> createActionSet(spark, inputPath, outputPath));
+
+	}
+
+	private static void createActionSet(SparkSession spark, String inputPath, String outputPath) {
+		JavaRDD<AtomicAction> relations = spark
+				.read()
+				.textFile(inputPath)
+				.map(
+						(MapFunction<String, TransformativeAgreementModel>) value -> OBJECT_MAPPER
+								.readValue(value, TransformativeAgreementModel.class),
+						Encoders.bean(TransformativeAgreementModel.class))
+				.flatMap(
+						(FlatMapFunction<TransformativeAgreementModel, Relation>) value -> createRelation(
+								value)
+								.iterator(),
+						Encoders.bean(Relation.class))
+				.filter((FilterFunction<Relation>) Objects::nonNull)
+				.toJavaRDD()
+				.map(p -> new AtomicAction(p.getClass(), p));
+//TODO relations in stand-by waiting to know if we need to create them or not In case we need just make a union before saving the sequence file
+				spark
+					.read()
+					.textFile(inputPath)
+					.map(
+						(MapFunction<String, TransformativeAgreementModel>) value -> OBJECT_MAPPER
+							.readValue(value, TransformativeAgreementModel.class),
+						Encoders.bean(TransformativeAgreementModel.class))
+					.map(
+						(MapFunction<TransformativeAgreementModel, Result>) value -> createResult(
+							value),
+						Encoders.bean(Result.class))
+					.filter((FilterFunction<Result>) r -> r != null)
+					.toJavaRDD()
+					.map(p -> new AtomicAction(p.getClass(), p))
+			.mapToPair(
+				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
+					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
+			.saveAsHadoopFile(
+				outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+
+	}
+
+	/**
+	 * Creates the {@link Result} used to enrich the graph with the information of a transformative agreement.
+	 * <p>
+	 * The identifier is built from the md5 of the normalized DOI; the result carries the agreement and a single
+	 * {@link Country} whose class id and class name are both the country found in the input record.
+	 *
+	 * @param value the record read from the Transformative Agreement dump
+	 * @return the result to be wrapped in an atomic action
+	 */
+	private static Result createResult(TransformativeAgreementModel value) {
+		Result r = new Result();
+		r
+			.setId(
+				"50|doi_________::"
+					+ IdentifierFactory
+						.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), value.getDoi())));
+		r.setTransformativeAgreement(value.getAgreement());
+		Country country = new Country();
+		country.setClassid(value.getCountry());
+		country.setClassname(value.getCountry());
+		country
+			.setDataInfo(
+				OafMapperUtils
+					.dataInfo(
+						false, ModelConstants.SYSIMPORT_ACTIONSET, false, false,
+						OafMapperUtils
+							.qualifier(
+								// same provenance class id used for the relations
+								TRANSFORMATIVE_AGREEMENT,
+								"Harvested from Transformative Agreement file from OpenAPC",
+								ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
+						"0.9"));
+		country.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE);
+		country.setSchemename(ModelConstants.DNET_COUNTRY_TYPE);
+		r.setCountry(Arrays.asList(country));
+		return r;
+	}
+
+	private static List<Relation> createRelation(TransformativeAgreementModel value) {
+
+		List<Relation> relationList = new ArrayList<>();
+
+		if (value.getAgreement().startsWith("IReL")) {
+			String paper;
+
+			paper = "50|doi_________::"
+				+ IdentifierFactory
+					.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), value.getDoi()));
+
+			relationList
+				.add(
+					getRelation(
+						paper,
+						IREL_PROJECT, ModelConstants.IS_PRODUCED_BY));
+
+			relationList.add(getRelation(IREL_PROJECT, paper, ModelConstants.PRODUCES));
+		}
+		return relationList;
+	}
+
+	/**
+	 * Builds a result-project relation of sub type outcome collected from OpenAPC.
+	 * <p>
+	 * The relation is neither inferred nor deleted by inference, it has trust {@code 0.9} and its provenance action
+	 * is the transformative agreement one.
+	 *
+	 * @param source the identifier of the source node
+	 * @param target the identifier of the target node
+	 * @param relClass the semantics of the relation
+	 * @return the relation from source to target
+	 */
+	public static Relation getRelation(
+		String source,
+		String target,
+		String relClass) {
+
+		final List<eu.dnetlib.dhp.schema.oaf.KeyValue> collectedFrom = Arrays
+			.asList(OafMapperUtils.keyValue(ModelConstants.OPEN_APC_ID, ModelConstants.OPEN_APC_NAME));
+
+		final eu.dnetlib.dhp.schema.oaf.Qualifier provenanceAction = OafMapperUtils
+			.qualifier(
+				TRANSFORMATIVE_AGREEMENT, "Transformative Agreement",
+				ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS);
+
+		final eu.dnetlib.dhp.schema.oaf.DataInfo dataInfo = OafMapperUtils
+			.dataInfo(false, null, false, false, provenanceAction, "0.9");
+
+		return OafMapperUtils
+			.getRelation(
+				source, target,
+				ModelConstants.RESULT_PROJECT, ModelConstants.OUTCOME, relClass,
+				collectedFrom, dataInfo, null);
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/transformativeagreement/model/TransformativeAgreementModel.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/transformativeagreement/model/TransformativeAgreementModel.java
@ -0,0 +1,51 @@
+
+package eu.dnetlib.dhp.actionmanager.transformativeagreement.model;
+
+import java.io.Serializable;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+
+/**
+ * @author miriam.baglioni
+ * @Date 18/12/23
+ */
+@JsonIgnoreProperties(ignoreUnknown = true)
+
+public class TransformativeAgreementModel implements Serializable {
+	/** The institution reported by the input record. */
+	private String institution;
+	/** The DOI of the publication covered by the agreement. */
+	private String doi;
+	/** The name of the transformative agreement. */
+	private String agreement;
+	/** The country reported by the input record. */
+	private String country;
+
+	/** @return the institution reported by the input record */
+	public String getInstitution() {
+		return institution;
+	}
+
+	/** @param institution the institution reported by the input record */
+	public void setInstitution(String institution) {
+		this.institution = institution;
+	}
+
+	/** @return the DOI of the publication */
+	public String getDoi() {
+		return doi;
+	}
+
+	/** @param doi the DOI of the publication */
+	public void setDoi(String doi) {
+		this.doi = doi;
+	}
+
+	/** @return the name of the transformative agreement */
+	public String getAgreement() {
+		return agreement;
+	}
+
+	/** @param agreement the name of the transformative agreement */
+	public void setAgreement(String agreement) {
+		this.agreement = agreement;
+	}
+
+	/** @return the country reported by the input record */
+	public String getCountry() {
+		return country;
+	}
+
+	/** @param country the country reported by the input record */
+	public void setCountry(String country) {
+		this.country = country;
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json
@ -17,6 +17,12 @@
    "paramDescription": "the path to get the input data from Pubmed",
    "paramRequired": true
  },
+  {
+    "paramName": "oip",
+    "paramLongName": "openapcInputPath",
+    "paramDescription": "the path to get the input data from OpenAPC",
+    "paramRequired": true
+  },
  {
    "paramName": "o",
    "paramLongName": "outputPath",
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties
@ -31,6 +31,7 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen
 # The following is needed as a property of a workflow
 oozie.wf.application.path=${oozieTopWfApplicationPath}

-crossrefInputPath=/data/bip-affiliations/data.json
+crossrefInputPath=/data/bip-affiliations/crossref-data.json
 pubmedInputPath=/data/bip-affiliations/pubmed-data.json
+openapcInputPath=/data/bip-affiliations/openapc-data.json
 outputPath=/tmp/crossref-affiliations-output-v5
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml
@ -9,6 +9,10 @@
            <name>pubmedInputPath</name>
            <description>the path where to find the inferred affiliation relations from Pubmed</description>
        </property>
+        <property>
+            <name>openapcInputPath</name>
+            <description>the path where to find the inferred affiliation relations from OpenAPC</description>
+        </property>
        <property>
            <name>outputPath</name>
            <description>the path where to store the actionset</description>
@ -102,6 +106,7 @@
            </spark-opts>
            <arg>--crossrefInputPath</arg><arg>${crossrefInputPath}</arg>
            <arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
+            <arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
        </spark>
        <ok to="End"/>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/as_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/as_parameters.json
@ -0,0 +1,20 @@
+[
+  {
+    "paramName": "ip",
+    "paramLongName": "inputPath",
+    "paramDescription": "the path of the transformative agreement JSON dump to read",
+    "paramRequired": true
+  },
+  {
+    "paramName": "op",
+    "paramLongName": "outputPath",
+    "paramDescription": "the path where the action set is stored",
+    "paramRequired": true
+  },
+  {
+    "paramName": "issm",
+    "paramLongName": "isSparkSessionManaged",
+    "paramDescription": "true if the spark session is managed, false otherwise",
+    "paramRequired": false
+  }
+]
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/input_read_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/input_read_parameters.json
@ -0,0 +1,30 @@
+[
+
+  {
+    "paramName": "issm",
+    "paramLongName": "isSparkSessionManaged",
+    "paramDescription": "true if the spark session is managed, false otherwise",
+    "paramRequired": false
+  },
+  {
+    "paramName": "d",
+    "paramLongName": "delimiter",
+    "paramDescription": "the delimiter used in the input file",
+    "paramRequired": false
+  },
+  {
+    "paramName": "op",
+    "paramLongName": "outputPath",
+    "paramDescription": "the path where the output is stored",
+    "paramRequired": true
+  },
+  {
+    "paramName": "if",
+    "paramLongName": "inputFile",
+    "paramDescription": "the location of the input file to read",
+    "paramRequired": true
+  }
+]
+
+
+
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/config-default.xml
@ -0,0 +1,58 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>hive_metastore_uris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+    </property>
+    <property>
+        <name>spark2YarnHistoryServerAddress</name>
+        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
+    </property>
+    <property>
+        <name>spark2ExtraListeners</name>
+        <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+    </property>
+    <property>
+        <name>spark2SqlQueryExecutionListeners</name>
+        <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>sparkExecutorNumber</name>
+        <value>4</value>
+    </property>
+    <property>
+        <name>spark2EventLogDir</name>
+        <value>/user/spark/spark2ApplicationHistory</value>
+    </property>
+    <property>
+        <name>sparkDriverMemory</name>
+        <value>15G</value>
+    </property>
+    <property>
+        <name>sparkExecutorMemory</name>
+        <value>6G</value>
+    </property>
+    <property>
+        <name>sparkExecutorCores</name>
+        <value>1</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/download.sh
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/download.sh
@ -0,0 +1,2 @@
+#!/bin/bash
+# Streams the resource found at the URL given as first argument into the HDFS file given as second argument.
+#
+# -e / pipefail: a failure of curl must fail the whole script, otherwise only the exit status of hdfs is seen.
+# -u: a missing argument is an error instead of an empty string.
+set -euo pipefail
+
+# --fail makes curl exit with an error on HTTP failures instead of piping the error page into HDFS;
+# the arguments are quoted so that URLs or paths with special characters are not split or expanded.
+curl -L --fail "$1" | hdfs dfs -put - "$2"
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/workflow.xml
@ -0,0 +1,82 @@
+<workflow-app name="Transformative Agreement Integration" xmlns="uri:oozie:workflow:0.5">
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value>
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value>
+            </property>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+
+        </configuration>
+    </global>
+
+    <start to="resume_from"/>
+
+    <decision name="resume_from">
+        <switch>
+            <case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
+            <default to="create_actionset"/> <!-- default: skip the download and build the action set from the dump already stored in the working directory -->
+        </switch>
+    </decision>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="download">
+        <shell xmlns="uri:oozie:shell-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <configuration>
+                <property>
+                    <name>mapred.job.queue.name</name>
+                    <value>${queueName}</value>
+                </property>
+            </configuration>
+            <exec>download.sh</exec>
+            <argument>${inputFile}</argument>
+            <argument>${workingDir}/transformativeagreement/transformativeAgreement.json</argument>
+            <env-var>HADOOP_USER_NAME=${wf:user()}</env-var>
+            <file>download.sh</file>
+            <capture-output/>
+        </shell>
+        <ok to="create_actionset"/>
+        <error to="Kill"/>
+    </action>
+
+
+    <action name="create_actionset">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Produces the AS for the Transformative Agreement</name>
+            <class>eu.dnetlib.dhp.actionmanager.transformativeagreement.CreateActionSetSparkJob</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory=${sparkExecutorMemory}
+                --executor-cores=${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+            </spark-opts>
+            <arg>--inputPath</arg><arg>${workingDir}/transformativeagreement/</arg>
+            <arg>--outputPath</arg><arg>${outputPath}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java
@ -78,10 +78,6 @@ public class PrepareAffiliationRelationsTest {
 			.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
 			.getPath();

-		String pubmedAffiliationRelationsPath = getClass()
-			.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
-			.getPath();
-
 		String outputPath = workingDir.toString() + "/actionSet";

 		PrepareAffiliationRelations
@ -89,7 +85,8 @@ public class PrepareAffiliationRelationsTest {
 				new String[] {
 					"-isSparkSessionManaged", Boolean.FALSE.toString(),
 					"-crossrefInputPath", crossrefAffiliationRelationPath,
-					"-pubmedInputPath", pubmedAffiliationRelationsPath,
+					"-pubmedInputPath", crossrefAffiliationRelationPath,
+					"-openapcInputPath", crossrefAffiliationRelationPath,
 					"-outputPath", outputPath
 				});

@ -106,7 +103,7 @@ public class PrepareAffiliationRelationsTest {
 //            );
 //        }
 		// count the number of relations
-		assertEquals(40, tmp.count());
+		assertEquals(60, tmp.count());

 		Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
 		dataset.createOrReplaceTempView("result");
@ -117,7 +114,7 @@ public class PrepareAffiliationRelationsTest {
 		// verify that we have equal number of bi-directional relations
 		Assertions
 			.assertEquals(
-				20, execVerification
+				30, execVerification
 					.filter(
 						"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
 					.collectAsList()
@ -125,7 +122,7 @@ public class PrepareAffiliationRelationsTest {

 		Assertions
 			.assertEquals(
-				20, execVerification
+				30, execVerification
 					.filter(
 						"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
 					.collectAsList()
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/transformativeagreement/CreateTAActionSetTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/transformativeagreement/CreateTAActionSetTest.java
@ -0,0 +1,324 @@
+
+package eu.dnetlib.dhp.actionmanager.transformativeagreement;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob;
+import eu.dnetlib.dhp.actionmanager.opencitations.CreateOpenCitationsASTest;
+import eu.dnetlib.dhp.schema.action.AtomicAction;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
+import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
+
+/**
+ * @author miriam.baglioni
+ * @Date 13/02/24
+ */
+public class CreateTAActionSetTest {
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+	private static SparkSession spark;
+
+	private static Path workingDir;
+	// the logger is bound to this test class, so that the log lines are attributed to the right test
+	private static final Logger log = LoggerFactory
+		.getLogger(CreateTAActionSetTest.class);
+
+	/**
+	 * Creates the temporary working directory and the local Spark session shared by all the tests.
+	 *
+	 * @throws IOException if the working directory cannot be created
+	 */
+	@BeforeAll
+	public static void beforeAll() throws IOException {
+		final String testName = CreateTAActionSetTest.class.getSimpleName();
+
+		workingDir = Files.createTempDirectory(testName);
+		log.info("using work dir {}", workingDir);
+
+		final SparkConf conf = new SparkConf()
+			.setAppName(testName)
+			.setMaster("local[*]")
+			.set("spark.driver.host", "localhost")
+			.set("hive.metastore.local", "true")
+			.set("spark.ui.enabled", "false")
+			.set("spark.sql.warehouse.dir", workingDir.toString())
+			.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
+
+		spark = SparkSession
+			.builder()
+			.appName(testName)
+			.config(conf)
+			.getOrCreate();
+	}
+
+	/**
+	 * Stops the Spark session and removes the working directory.
+	 *
+	 * @throws IOException if the working directory cannot be removed
+	 */
+	@AfterAll
+	public static void afterAll() throws IOException {
+		// the session is stopped first, so that it is released even if the removal of the directory fails
+		try {
+			spark.stop();
+		} finally {
+			FileUtils.deleteDirectory(workingDir.toFile());
+		}
+	}
+
+	@Test
+	void createActionSet() throws Exception {
+
+		String inputPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/transformativeagreement/facts.json")
+			.getPath();
+
+		eu.dnetlib.dhp.actionmanager.transformativeagreement.CreateActionSetSparkJob
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-inputPath",
+					inputPath,
+					"-outputPath",
+					workingDir.toString() + "/actionSet1"
+				});
+
+	}
+
+	@Test
+	void testNumberofRelations2() throws Exception {
+
+		String inputPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
+			.getPath();
+
+		eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-inputPath",
+					inputPath,
+					"-outputPath",
+					workingDir.toString() + "/actionSet2"
+				});
+
+		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+		JavaRDD<Relation> tmp = sc
+			.sequenceFile(workingDir.toString() + "/actionSet2", Text.class, Text.class)
+			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
+			.map(aa -> ((Relation) aa.getPayload()));
+
+		assertEquals(23, tmp.count());
+
+		// tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
+
+	}
+
+	@Test
+	void testRelationsCollectedFrom() throws Exception {
+
+		String inputPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
+			.getPath();
+
+		eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-inputPath",
+					inputPath,
+					"-outputPath",
+					workingDir.toString() + "/actionSet3"
+				});
+
+		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+		JavaRDD<Relation> tmp = sc
+			.sequenceFile(workingDir.toString() + "/actionSet3", Text.class, Text.class)
+			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
+			.map(aa -> ((Relation) aa.getPayload()));
+
+		tmp.foreach(r -> {
+			assertEquals(ModelConstants.OPENOCITATIONS_NAME, r.getCollectedfrom().get(0).getValue());
+			assertEquals(ModelConstants.OPENOCITATIONS_ID, r.getCollectedfrom().get(0).getKey());
+		});
+
+	}
+
+	@Test
+	void testRelationsDataInfo() throws Exception {
+
+		String inputPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
+			.getPath();
+
+		eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-inputPath",
+					inputPath,
+					"-outputPath",
+					workingDir.toString() + "/actionSet4"
+				});
+
+		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+		JavaRDD<Relation> tmp = sc
+			.sequenceFile(workingDir.toString() + "/actionSet4", Text.class, Text.class)
+			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
+			.map(aa -> ((Relation) aa.getPayload()));
+
+		tmp.foreach(r -> {
+			assertEquals(false, r.getDataInfo().getInferred());
+			assertEquals(false, r.getDataInfo().getDeletedbyinference());
+			assertEquals("0.91", r.getDataInfo().getTrust());
+			assertEquals(
+				eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob.OPENCITATIONS_CLASSID,
+				r.getDataInfo().getProvenanceaction().getClassid());
+			assertEquals(
+				eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob.OPENCITATIONS_CLASSNAME,
+				r.getDataInfo().getProvenanceaction().getClassname());
+			assertEquals(ModelConstants.DNET_PROVENANCE_ACTIONS, r.getDataInfo().getProvenanceaction().getSchemeid());
+			assertEquals(ModelConstants.DNET_PROVENANCE_ACTIONS, r.getDataInfo().getProvenanceaction().getSchemename());
+		});
+
+	}
+
+	@Test
+	void testRelationsSemantics() throws Exception {
+
+		String inputPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
+			.getPath();
+
+		eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-inputPath",
+					inputPath,
+					"-outputPath",
+					workingDir.toString() + "/actionSet5"
+				});
+
+		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+		JavaRDD<Relation> tmp = sc
+			.sequenceFile(workingDir.toString() + "/actionSet5", Text.class, Text.class)
+			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
+			.map(aa -> ((Relation) aa.getPayload()));
+
+		tmp.foreach(r -> {
+			assertEquals("citation", r.getSubRelType());
+			assertEquals("resultResult", r.getRelType());
+		});
+		assertEquals(23, tmp.filter(r -> r.getRelClass().equals("Cites")).count());
+		assertEquals(0, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count());
+
+	}
+
+	@Test
+	void testRelationsSourceTargetPrefix() throws Exception {
+
+		String inputPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
+			.getPath();
+
+		eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-inputPath",
+					inputPath,
+					"-outputPath",
+					workingDir.toString() + "/actionSet6"
+				});
+
+		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+		JavaRDD<Relation> tmp = sc
+			.sequenceFile(workingDir.toString() + "/actionSet6", Text.class, Text.class)
+			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
+			.map(aa -> ((Relation) aa.getPayload()));
+
+		tmp.foreach(r -> {
+			assertEquals("50|doi_________::", r.getSource().substring(0, 17));
+			assertEquals("50|doi_________::", r.getTarget().substring(0, 17));
+		});
+
+	}
+
+	/**
+	 * Verifies that the paper with DOI 10.1007/s10854-015-3684-x is involved in five relations, all of them having
+	 * the paper as source and {@code Cites} as semantics.
+	 * <p>
+	 * NOTE(review): this test runs the OpenCitations job on the COCI test data, not the Transformative Agreement
+	 * job.
+	 */
+	@Test
+	void testRelationsSourceTargetCouple() throws Exception {
+		final String doi1 = "50|doi_________::"
+			+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
+
+		String inputPath = getClass()
+			.getResource(
+				"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
+			.getPath();
+
+		// fully qualified on purpose: the class in this package has the same simple name
+		eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob
+			.main(
+				new String[] {
+					"-isSparkSessionManaged",
+					Boolean.FALSE.toString(),
+					"-inputPath",
+					inputPath,
+					"-outputPath",
+					workingDir.toString() + "/actionSet7"
+				});
+
+		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+
+		JavaRDD<Relation> tmp = sc
+			.sequenceFile(workingDir.toString() + "/actionSet7", Text.class, Text.class)
+			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
+			.map(aa -> ((Relation) aa.getPayload()));
+
+		JavaRDD<Relation> check = tmp.filter(r -> r.getSource().equals(doi1) || r.getTarget().equals(doi1));
+
+		assertEquals(5, check.count());
+
+		assertEquals(5, check.filter(r -> r.getSource().equals(doi1)).count());
+		check.filter(r -> r.getSource().equals(doi1)).foreach(r -> assertEquals(ModelConstants.CITES, r.getRelClass()));
+
+	}
+}
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/facts.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/facts.json
--- a/pom.xml
+++ b/pom.xml
@ -888,7 +888,7 @@
 		<mockito-core.version>3.3.3</mockito-core.version>
 		<mongodb.driver.version>3.4.2</mongodb.driver.version>
 		<vtd.version>[2.12,3.0)</vtd.version>
-		<dhp-schemas.version>[4.17.2]</dhp-schemas.version>
+		<dhp-schemas.version>[5.17.3]</dhp-schemas.version>
 		<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
 		<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
 		<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>