new rels produced by dedup workflow must be unique

This commit is contained in:
Claudio Atzori 2020-05-08 19:00:38 +02:00
parent 0ccc864ad9
commit fd519df616
4 changed files with 42 additions and 1 deletions

View File

@ -86,7 +86,8 @@ public class SparkPropagateRelation extends AbstractSparkAction {
mergedIds, mergedIds,
FieldType.TARGET, FieldType.TARGET,
getFixRelFn(FieldType.TARGET)) getFixRelFn(FieldType.TARGET))
.filter(SparkPropagateRelation::containsDedup); .filter(SparkPropagateRelation::containsDedup)
.distinct();
Dataset<Relation> updated = processDataset( Dataset<Relation> updated = processDataset(
processDataset(rels, mergedIds, FieldType.SOURCE, getDeletedFn()), processDataset(rels, mergedIds, FieldType.SOURCE, getDeletedFn()),

View File

@ -12,12 +12,14 @@ import java.io.Serializable;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.nio.file.Paths; import java.nio.file.Paths;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
@ -450,6 +452,22 @@ public class SparkDedupTest implements Serializable {
assertEquals(updated, deletedbyinference); assertEquals(updated, deletedbyinference);
} }
@Test
@Order(6)
public void testRelations() throws Exception {
testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_1.json", 12, 10);
testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_2.json", 10, 2);
}
private void testUniqueness(String path, int expected_total, int expected_unique) {
Dataset<Relation> rel = spark.read()
.textFile(getClass().getResource(path).getPath())
.map((MapFunction<String, Relation>) s -> new ObjectMapper().readValue(s, Relation.class), Encoders.bean(Relation.class));
assertEquals(expected_total, rel.count());
assertEquals(expected_unique, rel.distinct().count());
}
@AfterAll @AfterAll
public static void finalCleanUp() throws IOException { public static void finalCleanUp() throws IOException {
FileUtils.deleteDirectory(new File(testOutputBasePath)); FileUtils.deleteDirectory(new File(testOutputBasePath));

View File

@ -0,0 +1,12 @@
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|doajarticles::40c7b1dfa18c3693d374dafd21ef852f","subRelType":"provision","target":"10|doajarticles::618df40624078491acfd93ca3ff6921c"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|doajarticles::0b4e756a73338f60b84de98d080f6422","subRelType":"provision","target":"10|doajarticles::6d01e689db13b6977b411f4170b6143b"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|doajarticles::fe2f7c9d350b9c5aa658ec384d761e33","subRelType":"provision","target":"10|doajarticles::9b8a956b0703854ba79e52ddf7dc552e"}
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|doajarticles::a116734108ba011ef715b012f095e3f5","subRelType":"provision","target":"10|doajarticles::c5de04b1a35da2cc4468e299bc9ffa16"}
{"collectedfrom":[{"key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb","value":"OpenDOAR"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|opendoar____::8b83abbbcad5496fe43cda88d0045aa4","subRelType":"provision","target":"10|opendoar____::6855456e2fe46a9d49d3d3af4f57443d"}
{"collectedfrom":[{"key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb","value":"OpenDOAR"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|opendoar____::88034de0247d9d36e22783e9319c5ba3","subRelType":"provision","target":"10|opendoar____::c17028c9b6e0c5deaad29665d582284a"}
{"collectedfrom":[{"key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb","value":"OpenDOAR"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|opendoar____::dfb21c796f33e9acf505cc960a3d8d2c","subRelType":"provision","target":"10|opendoar____::dfa037a53e121ecc9e0926800c3e814e"}
{"collectedfrom":[{"key":"10|openaire____::21f8a223b9925c2f87c404096080b046","value":"Registry of Research Data Repository"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|re3data_____::b526b1aa1562038881a31be59896985f","subRelType":"provision","target":"10|re3data_____::2e457773b62df3534cc04441bf406a70"}
{"collectedfrom":[{"key":"10|openaire____::21f8a223b9925c2f87c404096080b046","value":"Registry of Research Data Repository"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|re3data_____::6b306183bc051b5aaa5376f2fab6e6e5","subRelType":"provision","target":"10|re3data_____::6371ff9ee1ec7073416cb83c868b10a3"}
{"collectedfrom":[{"key":"10|openaire____::21f8a223b9925c2f87c404096080b046","value":"Registry of Research Data Repository"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|re3data_____::0f697c2543a43bc0da793bf78ecd4996","subRelType":"provision","target":"10|re3data_____::770ef1f8eb03f174c0add746523c6f28"}
{"collectedfrom":[{"key":"10|openaire____::21f8a223b9925c2f87c404096080b046","value":"Registry of Research Data Repository"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|re3data_____::0f697c2543a43bc0da793bf78ecd4996","subRelType":"provision","target":"10|re3data_____::770ef1f8eb03f174c0add746523c6f28"}
{"collectedfrom":[{"key":"10|openaire____::21f8a223b9925c2f87c404096080b046","value":"Registry of Research Data Repository"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|re3data_____::0f697c2543a43bc0da793bf78ecd4996","subRelType":"provision","target":"10|re3data_____::770ef1f8eb03f174c0add746523c6f28"}

View File

@ -0,0 +1,10 @@
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681629"}