[orcipPropagation] test to verify the merge for multiple enrichment from multiple sources. It covers also the check for persistency of other identifiers types

2024-12-20 14:41:09 +01:00 · 2024-12-20 14:41:09 +01:00 · 071cc95afb
parent 29611b2091
commit 071cc95afb
10 changed files with 46 additions and 85 deletions
--- a/dhp-common/src/main/scala/eu/dnetlib/dhp/common/author/SparkEnrichWithOrcidAuthors.scala
+++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/common/author/SparkEnrichWithOrcidAuthors.scala
@ -9,11 +9,7 @@ import org.slf4j.{Logger, LoggerFactory}
 import eu.dnetlib.dhp.common.enrichment.Constants.PROPAGATION_DATA_INFO_TYPE
 import eu.dnetlib.dhp.schema.oaf.{OafEntity, Result}
 import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils
-import org.apache.spark.api.java.function.{MapFunction, MapGroupsFunction}
-import org.apache.spark.sql.expressions.Aggregator

-import java.util
-import java.util.Iterator
 import scala.collection.JavaConverters._

 abstract class SparkEnrichWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger)
@ -83,7 +79,8 @@ abstract class SparkEnrichWithOrcidAuthors(propertyPath: String, args: Array[Str
          .mapGroups((k, it) => {
            val p: Result = it.next

-            it.foldLeft(p.getAuthor)((x,r) => MergeUtils.mergeAuthors(x, r.getAuthor,0))
+            p.setAuthor(it.foldLeft(p.getAuthor)((x,r) => MergeUtils.mergeAuthors(x, r.getAuthor,0)))
+
            p

          })(Encoders.bean(resultClazz))
@ -94,7 +91,6 @@ abstract class SparkEnrichWithOrcidAuthors(propertyPath: String, args: Array[Str

      })
  }
-  // def generateGraph(spark: SparkSession, graphPath: String, workingDir: String, targetPath: String): Unit

  def createTemporaryData(spark: SparkSession, graphPath: String, orcidPath: String, targetPath: String): Unit

--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkPropagateOrcidAuthor.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkPropagateOrcidAuthor.java
@ -6,14 +6,9 @@ import java.util.Objects;
 import java.util.Optional;
 import java.util.stream.Collectors;

-import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.schema.oaf.*;
-import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
-import eu.dnetlib.dhp.utils.ORCIDAuthorEnricherResult;
 import org.apache.spark.api.java.function.FilterFunction;
-import org.apache.spark.api.java.function.ForeachFunction;
 import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.Dataset;
 import org.slf4j.Logger;
@ -24,7 +19,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.utils.OrcidAuthor;
 import scala.Tuple2;
-import static org.apache.spark.sql.functions.*;
+

 public class SparkPropagateOrcidAuthor extends SparkEnrichWithOrcidAuthors {
 	private static final Logger log = LoggerFactory.getLogger(SparkPropagateOrcidAuthor.class);
@ -89,69 +84,6 @@ public class SparkPropagateOrcidAuthor extends SparkEnrichWithOrcidAuthors {

 	}

-//	private <T extends Result> void processAndMerge(
-//			SparkSession spark,
-//			String inputPath,
-//			String outputPath,
-//			Class<T> clazz,
-//			Encoder<T> encoder) {
-//
-//		spark.read()
-//				.schema(Encoders.bean(clazz).schema())
-//				.json(inputPath)
-//				.as(encoder)
-//				.groupByKey((MapFunction<T, String>) OafEntity::getId, Encoders.STRING())
-//				.mapGroups((MapGroupsFunction<String, T, T>) (k, it) -> {
-//					T p = it.next();
-//					it.forEachRemaining(r -> p.setAuthor(MergeUtils.mergeAuthors(p.getAuthor(),r.getAuthor(),0)));
-//					return p;
-//				}, encoder)
-//				.write()
-//				.mode(SaveMode.Overwrite)
-//				.option("compression", "gzip")
-//				.json(outputPath);
-//	}
-//	@Override
-//public void generateGraph(SparkSession spark, String graphPath, String workingDir, String targetPath){
-//
-//	ModelSupport.entityTypes.keySet().stream().filter(ModelSupport::isResult)
-//				.forEach(e -> {
-//					Class resultClazz = ModelSupport.entityTypes.get(e);
-//					Dataset<Row> matched = spark
-//							.read()
-//							.schema(Encoders.bean(ORCIDAuthorEnricherResult.class).schema())
-//							.parquet(workingDir + "/" + e.name() + "_matched")
-//							.selectExpr("id","enriched_author");
-//					Dataset<Row> result = spark.read().schema(Encoders.bean(resultClazz).schema())
-//							.json(graphPath + "/" + e.name());
-//
-//
-//
-//	result.join(matched, result.col("id").equalTo(matched.col("id")), "left")
-//			.withColumn(
-//					"author",
-//					when(size(col("enriched_author")).gt(0), col("enriched_author"))
-//							.otherwise(col("author"))
-//			)
-//			.drop(matched.col("id"))
-//			.drop("enriched_author")
-//			.write()
-//			.mode(SaveMode.Overwrite)
-//			.option("compression", "gzip")
-//			.json(workingDir + "/" + e.name() + "/_tobemerged")
-//	;
-//					processAndMerge(
-//							spark,
-//							workingDir + "/" + e.name() + "/_tobemerged",
-//							targetPath + "/" + e.name(),
-//							resultClazz,
-//							Encoders.bean(resultClazz)
-//					);
-//				});
-//
-//
-//	}
-
 	@Override
 	public void createTemporaryData(SparkSession spark, String graphPath, String orcidPath, String targetPath) {
 		Dataset<Row> supplements = spark
@ -163,7 +95,6 @@ public class SparkPropagateOrcidAuthor extends SparkEnrichWithOrcidAuthors {
 								ModelConstants.IS_SUPPLEMENTED_BY + "')")
 				.selectExpr("source as id", "target");

-
 		Dataset<Row> result = spark
 				.read()
 				.schema(Encoders.bean(Result.class).schema())
--- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java
@ -183,16 +183,17 @@ public class OrcidPropagationJobTest {
 						.getResource(
 							"/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates")
 						.getPath(),
-					"-targetPath",
-					workingDir.toString() + "/dataset",
-					"-orcidPath", "",
-					"-workingDir", workingDir.toString()
+						"-targetPath",
+						workingDir.toString() + "/graph",
+						"-orcidPath", "",
+						"-workingDir", workingDir.toString(),
+						"-matchingSource", "propagation"
 				});

 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

 		JavaRDD<Dataset> tmp = sc
-			.textFile(workingDir.toString() + "/dataset")
+			.textFile(workingDir.toString() + "/graph/dataset")
 			.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));

 		Assertions.assertEquals(10, tmp.count());
@ -214,10 +215,10 @@ public class OrcidPropagationJobTest {

 		Assertions
 			.assertEquals(
-				1, propagatedAuthors.filter("name = 'Marc' and surname = 'Schmidtmann'").count());
+				1, propagatedAuthors.filter("name = 'Nicole' and surname = 'Jung'").count());
 		Assertions
 			.assertEquals(
-				1, propagatedAuthors.filter("name = 'Ruediger' and surname = 'Beckhaus'").count());
+				1, propagatedAuthors.filter("name = 'Camilo José' and surname = 'Cela'").count());

 		query = "select id, MyT.name name, MyT.surname surname, MyP.value pid ,MyP.qualifier.classid pidType "
 			+ "from dataset "
@ -228,13 +229,27 @@ public class OrcidPropagationJobTest {

 		Assertions
 			.assertEquals(
-				2, authorsExplodedPids.filter("name = 'Marc' and surname = 'Schmidtmann'").count());
+				3, authorsExplodedPids.filter("name = 'Camilo José' and surname = 'Cela'").count());
 		Assertions
 			.assertEquals(
 				1,
 				authorsExplodedPids
 					.filter(
-						"name = 'Marc' and surname = 'Schmidtmann' and pidType = 'MAG Identifier'")
+						"name = 'Camilo José' and surname = 'Cela' and pidType = 'MAG Identifier'")
 					.count());
+		Assertions
+				.assertEquals(
+						1,
+						authorsExplodedPids
+								.filter(
+										"name = 'Camilo José' and surname = 'Cela' and pidType = 'orcid'")
+								.count());
+		Assertions
+				.assertEquals(
+						1,
+						authorsExplodedPids
+								.filter(
+										"name = 'Camilo José' and surname = 'Cela' and pidType = 'orcid_pending'")
+								.count());
 	}
 }
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate/dataset_10.json.gz
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate/dataset_10.json.gz
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/dataset/dataset_10.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/dataset/dataset_10.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/dataset_10.json.gz
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/dataset_10.json.gz
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/otherresearchproduct/empty.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/otherresearchproduct/empty.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/publication/empty.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/publication/empty.json
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/relation/relation.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/relation/relation.json
@ -0,0 +1,9 @@
+{"subRelType": "supplement", "relClass": "IsSupplementedBy", "dataInfo": {"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_affiliations", "invisible": false, "trust": "0.7731"}, "target": "50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e", "lastupdatetimestamp": 1694431186898, "relType": "resultOrganization", "source": "50|dedup_wf_001::36bcfaa1494c849547a346da688ade24", "collectedfrom": [], "validated": false, "properties": []}
+{"subRelType": "supplement", "relClass": "IsSupplementTo", "dataInfo": {"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_affiliations", "invisible": false, "trust": "0.7731"}, "source": "50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e", "lastupdatetimestamp": 1694431186898, "relType": "resultOrganization", "target": "50|dedup_wf_001::36bcfaa1494c849547a346da688ade24", "collectedfrom": [], "validated": false, "properties": []}
+{"subRelType": "supplement", "relClass": "IsSupplementedBy", "dataInfo": {"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_affiliations", "invisible": false, "trust": "0.7731"}, "target": "50|od______3989::0f89464c4ac4c398fe0c71433b175a62", "lastupdatetimestamp": 1694431186898, "relType": "resultOrganization", "source": "50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e", "collectedfrom": [], "validated": false, "properties": []}
+{"subRelType": "supplement", "relClass": "IsSupplementTo", "dataInfo": {"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_affiliations", "invisible": false, "trust": "0.7731"}, "source": "50|od______3989::0f89464c4ac4c398fe0c71433b175a62", "lastupdatetimestamp": 1694431186898, "relType": "resultOrganization", "target": "50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e", "collectedfrom": [], "validated": false, "properties": []}
+
+
+
+
+
--- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/software/empty.json
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/software/empty.json