forked from D-Net/dnet-hadoop
new rels produced by dedup workflow must be unique
This commit is contained in:
parent
0ccc864ad9
commit
fd519df616
|
@ -86,7 +86,8 @@ public class SparkPropagateRelation extends AbstractSparkAction {
|
||||||
mergedIds,
|
mergedIds,
|
||||||
FieldType.TARGET,
|
FieldType.TARGET,
|
||||||
getFixRelFn(FieldType.TARGET))
|
getFixRelFn(FieldType.TARGET))
|
||||||
.filter(SparkPropagateRelation::containsDedup);
|
.filter(SparkPropagateRelation::containsDedup)
|
||||||
|
.distinct();
|
||||||
|
|
||||||
Dataset<Relation> updated = processDataset(
|
Dataset<Relation> updated = processDataset(
|
||||||
processDataset(rels, mergedIds, FieldType.SOURCE, getDeletedFn()),
|
processDataset(rels, mergedIds, FieldType.SOURCE, getDeletedFn()),
|
||||||
|
|
|
@ -12,12 +12,14 @@ import java.io.Serializable;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.PairFunction;
|
import org.apache.spark.api.java.function.PairFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
|
@ -450,6 +452,22 @@ public class SparkDedupTest implements Serializable {
|
||||||
assertEquals(updated, deletedbyinference);
|
assertEquals(updated, deletedbyinference);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Order(6)
|
||||||
|
public void testRelations() throws Exception {
|
||||||
|
testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_1.json", 12, 10);
|
||||||
|
testUniqueness("/eu/dnetlib/dhp/dedup/test/relation_2.json", 10, 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testUniqueness(String path, int expected_total, int expected_unique) {
|
||||||
|
Dataset<Relation> rel = spark.read()
|
||||||
|
.textFile(getClass().getResource(path).getPath())
|
||||||
|
.map((MapFunction<String, Relation>) s -> new ObjectMapper().readValue(s, Relation.class), Encoders.bean(Relation.class));
|
||||||
|
|
||||||
|
assertEquals(expected_total, rel.count());
|
||||||
|
assertEquals(expected_unique, rel.distinct().count());
|
||||||
|
}
|
||||||
|
|
||||||
@AfterAll
|
@AfterAll
|
||||||
public static void finalCleanUp() throws IOException {
|
public static void finalCleanUp() throws IOException {
|
||||||
FileUtils.deleteDirectory(new File(testOutputBasePath));
|
FileUtils.deleteDirectory(new File(testOutputBasePath));
|
||||||
|
|
|
@ -0,0 +1,12 @@
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|doajarticles::40c7b1dfa18c3693d374dafd21ef852f","subRelType":"provision","target":"10|doajarticles::618df40624078491acfd93ca3ff6921c"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|doajarticles::0b4e756a73338f60b84de98d080f6422","subRelType":"provision","target":"10|doajarticles::6d01e689db13b6977b411f4170b6143b"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|doajarticles::fe2f7c9d350b9c5aa658ec384d761e33","subRelType":"provision","target":"10|doajarticles::9b8a956b0703854ba79e52ddf7dc552e"}
|
||||||
|
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|doajarticles::a116734108ba011ef715b012f095e3f5","subRelType":"provision","target":"10|doajarticles::c5de04b1a35da2cc4468e299bc9ffa16"}
|
||||||
|
{"collectedfrom":[{"key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb","value":"OpenDOAR"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|opendoar____::8b83abbbcad5496fe43cda88d0045aa4","subRelType":"provision","target":"10|opendoar____::6855456e2fe46a9d49d3d3af4f57443d"}
|
||||||
|
{"collectedfrom":[{"key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb","value":"OpenDOAR"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|opendoar____::88034de0247d9d36e22783e9319c5ba3","subRelType":"provision","target":"10|opendoar____::c17028c9b6e0c5deaad29665d582284a"}
|
||||||
|
{"collectedfrom":[{"key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb","value":"OpenDOAR"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|opendoar____::dfb21c796f33e9acf505cc960a3d8d2c","subRelType":"provision","target":"10|opendoar____::dfa037a53e121ecc9e0926800c3e814e"}
|
||||||
|
{"collectedfrom":[{"key":"10|openaire____::21f8a223b9925c2f87c404096080b046","value":"Registry of Research Data Repository"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|re3data_____::b526b1aa1562038881a31be59896985f","subRelType":"provision","target":"10|re3data_____::2e457773b62df3534cc04441bf406a70"}
|
||||||
|
{"collectedfrom":[{"key":"10|openaire____::21f8a223b9925c2f87c404096080b046","value":"Registry of Research Data Repository"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|re3data_____::6b306183bc051b5aaa5376f2fab6e6e5","subRelType":"provision","target":"10|re3data_____::6371ff9ee1ec7073416cb83c868b10a3"}
|
||||||
|
{"collectedfrom":[{"key":"10|openaire____::21f8a223b9925c2f87c404096080b046","value":"Registry of Research Data Repository"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|re3data_____::0f697c2543a43bc0da793bf78ecd4996","subRelType":"provision","target":"10|re3data_____::770ef1f8eb03f174c0add746523c6f28"}
|
||||||
|
{"collectedfrom":[{"key":"10|openaire____::21f8a223b9925c2f87c404096080b046","value":"Registry of Research Data Repository"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|re3data_____::0f697c2543a43bc0da793bf78ecd4996","subRelType":"provision","target":"10|re3data_____::770ef1f8eb03f174c0add746523c6f28"}
|
||||||
|
{"collectedfrom":[{"key":"10|openaire____::21f8a223b9925c2f87c404096080b046","value":"Registry of Research Data Repository"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1588608946167,"relClass":"provides","relType":"datasourceOrganization","source":"20|re3data_____::0f697c2543a43bc0da793bf78ecd4996","subRelType":"provision","target":"10|re3data_____::770ef1f8eb03f174c0add746523c6f28"}
|
|
@ -0,0 +1,10 @@
|
||||||
|
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
|
||||||
|
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
|
||||||
|
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
|
||||||
|
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
|
||||||
|
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
|
||||||
|
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
|
||||||
|
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
|
||||||
|
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
|
||||||
|
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681628"}
|
||||||
|
{"collectedfrom":null,"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-test","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"sysimport:dedup","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null},"lastupdatetimestamp":null,"relClass":"isMergedIn","relType":"resultResult","source":"50|dedup_wf_001::498c4e6cfff198831b488a6c62221241","subRelType":"dedup","target":"50|doiboost____::8e5e14d80d0f2ebe6a6a55d972681629"}
|
Loading…
Reference in New Issue