[orcipPropagation] test to verify the merge for multiple enrichment from multiple sources. It covers also the check for persistency of other identifiers types

This commit is contained in:
Miriam Baglioni 2024-12-20 14:41:09 +01:00
parent 29611b2091
commit 071cc95afb
10 changed files with 46 additions and 85 deletions

View File

@ -9,11 +9,7 @@ import org.slf4j.{Logger, LoggerFactory}
import eu.dnetlib.dhp.common.enrichment.Constants.PROPAGATION_DATA_INFO_TYPE
import eu.dnetlib.dhp.schema.oaf.{OafEntity, Result}
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils
import org.apache.spark.api.java.function.{MapFunction, MapGroupsFunction}
import org.apache.spark.sql.expressions.Aggregator
import java.util
import java.util.Iterator
import scala.collection.JavaConverters._
abstract class SparkEnrichWithOrcidAuthors(propertyPath: String, args: Array[String], log: Logger)
@ -83,7 +79,8 @@ abstract class SparkEnrichWithOrcidAuthors(propertyPath: String, args: Array[Str
.mapGroups((k, it) => {
val p: Result = it.next
it.foldLeft(p.getAuthor)((x,r) => MergeUtils.mergeAuthors(x, r.getAuthor,0))
p.setAuthor(it.foldLeft(p.getAuthor)((x,r) => MergeUtils.mergeAuthors(x, r.getAuthor,0)))
p
})(Encoders.bean(resultClazz))
@ -94,7 +91,6 @@ abstract class SparkEnrichWithOrcidAuthors(propertyPath: String, args: Array[Str
})
}
// def generateGraph(spark: SparkSession, graphPath: String, workingDir: String, targetPath: String): Unit
def createTemporaryData(spark: SparkSession, graphPath: String, orcidPath: String, targetPath: String): Unit

View File

@ -6,14 +6,9 @@ import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
import eu.dnetlib.dhp.utils.ORCIDAuthorEnricherResult;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
@ -24,7 +19,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.utils.OrcidAuthor;
import scala.Tuple2;
import static org.apache.spark.sql.functions.*;
public class SparkPropagateOrcidAuthor extends SparkEnrichWithOrcidAuthors {
private static final Logger log = LoggerFactory.getLogger(SparkPropagateOrcidAuthor.class);
@ -89,69 +84,6 @@ public class SparkPropagateOrcidAuthor extends SparkEnrichWithOrcidAuthors {
}
// private <T extends Result> void processAndMerge(
// SparkSession spark,
// String inputPath,
// String outputPath,
// Class<T> clazz,
// Encoder<T> encoder) {
//
// spark.read()
// .schema(Encoders.bean(clazz).schema())
// .json(inputPath)
// .as(encoder)
// .groupByKey((MapFunction<T, String>) OafEntity::getId, Encoders.STRING())
// .mapGroups((MapGroupsFunction<String, T, T>) (k, it) -> {
// T p = it.next();
// it.forEachRemaining(r -> p.setAuthor(MergeUtils.mergeAuthors(p.getAuthor(),r.getAuthor(),0)));
// return p;
// }, encoder)
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .json(outputPath);
// }
// @Override
//public void generateGraph(SparkSession spark, String graphPath, String workingDir, String targetPath){
//
// ModelSupport.entityTypes.keySet().stream().filter(ModelSupport::isResult)
// .forEach(e -> {
// Class resultClazz = ModelSupport.entityTypes.get(e);
// Dataset<Row> matched = spark
// .read()
// .schema(Encoders.bean(ORCIDAuthorEnricherResult.class).schema())
// .parquet(workingDir + "/" + e.name() + "_matched")
// .selectExpr("id","enriched_author");
// Dataset<Row> result = spark.read().schema(Encoders.bean(resultClazz).schema())
// .json(graphPath + "/" + e.name());
//
//
//
// result.join(matched, result.col("id").equalTo(matched.col("id")), "left")
// .withColumn(
// "author",
// when(size(col("enriched_author")).gt(0), col("enriched_author"))
// .otherwise(col("author"))
// )
// .drop(matched.col("id"))
// .drop("enriched_author")
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .json(workingDir + "/" + e.name() + "/_tobemerged")
// ;
// processAndMerge(
// spark,
// workingDir + "/" + e.name() + "/_tobemerged",
// targetPath + "/" + e.name(),
// resultClazz,
// Encoders.bean(resultClazz)
// );
// });
//
//
// }
@Override
public void createTemporaryData(SparkSession spark, String graphPath, String orcidPath, String targetPath) {
Dataset<Row> supplements = spark
@ -163,7 +95,6 @@ public class SparkPropagateOrcidAuthor extends SparkEnrichWithOrcidAuthors {
ModelConstants.IS_SUPPLEMENTED_BY + "')")
.selectExpr("source as id", "target");
Dataset<Row> result = spark
.read()
.schema(Encoders.bean(Result.class).schema())

View File

@ -183,16 +183,17 @@ public class OrcidPropagationJobTest {
.getResource(
"/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates")
.getPath(),
"-targetPath",
workingDir.toString() + "/dataset",
"-orcidPath", "",
"-workingDir", workingDir.toString()
"-targetPath",
workingDir.toString() + "/graph",
"-orcidPath", "",
"-workingDir", workingDir.toString(),
"-matchingSource", "propagation"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset")
.textFile(workingDir.toString() + "/graph/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
Assertions.assertEquals(10, tmp.count());
@ -214,10 +215,10 @@ public class OrcidPropagationJobTest {
Assertions
.assertEquals(
1, propagatedAuthors.filter("name = 'Marc' and surname = 'Schmidtmann'").count());
1, propagatedAuthors.filter("name = 'Nicole' and surname = 'Jung'").count());
Assertions
.assertEquals(
1, propagatedAuthors.filter("name = 'Ruediger' and surname = 'Beckhaus'").count());
1, propagatedAuthors.filter("name = 'Camilo José' and surname = 'Cela'").count());
query = "select id, MyT.name name, MyT.surname surname, MyP.value pid ,MyP.qualifier.classid pidType "
+ "from dataset "
@ -228,13 +229,27 @@ public class OrcidPropagationJobTest {
Assertions
.assertEquals(
2, authorsExplodedPids.filter("name = 'Marc' and surname = 'Schmidtmann'").count());
3, authorsExplodedPids.filter("name = 'Camilo José' and surname = 'Cela'").count());
Assertions
.assertEquals(
1,
authorsExplodedPids
.filter(
"name = 'Marc' and surname = 'Schmidtmann' and pidType = 'MAG Identifier'")
"name = 'Camilo José' and surname = 'Cela' and pidType = 'MAG Identifier'")
.count());
Assertions
.assertEquals(
1,
authorsExplodedPids
.filter(
"name = 'Camilo José' and surname = 'Cela' and pidType = 'orcid'")
.count());
Assertions
.assertEquals(
1,
authorsExplodedPids
.filter(
"name = 'Camilo José' and surname = 'Cela' and pidType = 'orcid_pending'")
.count());
}
}

View File

@ -0,0 +1,9 @@
{"subRelType": "supplement", "relClass": "IsSupplementedBy", "dataInfo": {"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_affiliations", "invisible": false, "trust": "0.7731"}, "target": "50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e", "lastupdatetimestamp": 1694431186898, "relType": "resultOrganization", "source": "50|dedup_wf_001::36bcfaa1494c849547a346da688ade24", "collectedfrom": [], "validated": false, "properties": []}
{"subRelType": "supplement", "relClass": "IsSupplementTo", "dataInfo": {"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_affiliations", "invisible": false, "trust": "0.7731"}, "source": "50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e", "lastupdatetimestamp": 1694431186898, "relType": "resultOrganization", "target": "50|dedup_wf_001::36bcfaa1494c849547a346da688ade24", "collectedfrom": [], "validated": false, "properties": []}
{"subRelType": "supplement", "relClass": "IsSupplementedBy", "dataInfo": {"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_affiliations", "invisible": false, "trust": "0.7731"}, "target": "50|od______3989::0f89464c4ac4c398fe0c71433b175a62", "lastupdatetimestamp": 1694431186898, "relType": "resultOrganization", "source": "50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e", "collectedfrom": [], "validated": false, "properties": []}
{"subRelType": "supplement", "relClass": "IsSupplementTo", "dataInfo": {"provenanceaction": {"classid": "iis", "classname": "Inferred by OpenAIRE", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": true, "inferenceprovenance": "iis::document_affiliations", "invisible": false, "trust": "0.7731"}, "source": "50|od______3989::0f89464c4ac4c398fe0c71433b175a62", "lastupdatetimestamp": 1694431186898, "relType": "resultOrganization", "target": "50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e", "collectedfrom": [], "validated": false, "properties": []}