Fixed the Spark dedup test; added sample data for dataset and otherresearchproduct (orp), and implemented the corresponding tests

This commit is contained in:
miconis 2020-04-23 18:16:20 +02:00
parent 8851050814
commit 8d258c85ff
7 changed files with 822 additions and 12 deletions

View File

@ -57,7 +57,6 @@ public class SparkDedupTest implements Serializable {
.toURI()) .toURI())
.toFile() .toFile()
.getAbsolutePath(); .getAbsolutePath();
testOutputBasePath = testOutputBasePath =
createTempDirectory(SparkDedupTest.class.getSimpleName() + "-") createTempDirectory(SparkDedupTest.class.getSimpleName() + "-")
.toAbsolutePath() .toAbsolutePath()
@ -110,6 +109,20 @@ public class SparkDedupTest implements Serializable {
IOUtils.toString( IOUtils.toString(
SparkDedupTest.class.getResourceAsStream( SparkDedupTest.class.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json"))); "/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json")));
lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset")))
.thenReturn(
IOUtils.toString(
SparkDedupTest.class.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json")));
lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct")))
.thenReturn(
IOUtils.toString(
SparkDedupTest.class.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json")));
} }
@Test @Test
@ -144,9 +157,21 @@ public class SparkDedupTest implements Serializable {
.load(testOutputBasePath + "/" + testActionSetId + "/software_simrel") .load(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
.count(); .count();
long ds_simrel =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel")
.count();
long orp_simrel =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
.count();
assertEquals(3432, orgs_simrel); assertEquals(3432, orgs_simrel);
assertEquals(7260, pubs_simrel); assertEquals(7260, pubs_simrel);
assertEquals(344, sw_simrel); assertEquals(344, sw_simrel);
assertEquals(458, ds_simrel);
assertEquals(6740, orp_simrel);
} }
@Test @Test
@ -181,9 +206,21 @@ public class SparkDedupTest implements Serializable {
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel") .load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
.count(); .count();
long ds_mergerel =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel")
.count();
long orp_mergerel =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
.count();
assertEquals(1276, orgs_mergerel); assertEquals(1276, orgs_mergerel);
assertEquals(1460, pubs_mergerel); assertEquals(1460, pubs_mergerel);
assertEquals(288, sw_mergerel); assertEquals(288, sw_mergerel);
assertEquals(472, ds_mergerel);
assertEquals(714, orp_mergerel);
} }
@Test @Test
@ -217,15 +254,22 @@ public class SparkDedupTest implements Serializable {
testOutputBasePath testOutputBasePath
+ "/" + "/"
+ testActionSetId + testActionSetId
+ "/publication_deduprecord") + "/publication_deduprecord").count();
.count();
long sw_deduprecord = long sw_deduprecord =
jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord") jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord")
.count(); .count();
long ds_deduprecord =
jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_deduprecord")
.count();
long orp_deduprecord =
jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
.count();
assertEquals(82, orgs_deduprecord); assertEquals(82, orgs_deduprecord);
assertEquals(66, pubs_deduprecord); assertEquals(66, pubs_deduprecord);
assertEquals(51, sw_deduprecord); assertEquals(51, sw_deduprecord);
assertEquals(96, ds_deduprecord);
assertEquals(89, orp_deduprecord);
} }
@Test @Test
@ -251,6 +295,8 @@ public class SparkDedupTest implements Serializable {
long projects = jsc.textFile(testDedupGraphBasePath + "/project").count(); long projects = jsc.textFile(testDedupGraphBasePath + "/project").count();
long datasource = jsc.textFile(testDedupGraphBasePath + "/datasource").count(); long datasource = jsc.textFile(testDedupGraphBasePath + "/datasource").count();
long softwares = jsc.textFile(testDedupGraphBasePath + "/software").count(); long softwares = jsc.textFile(testDedupGraphBasePath + "/software").count();
long dataset = jsc.textFile(testDedupGraphBasePath + "/dataset").count();
long otherresearchproduct = jsc.textFile(testDedupGraphBasePath + "/otherresearchproduct").count();
long mergedOrgs = long mergedOrgs =
spark.read() spark.read()
@ -282,11 +328,33 @@ public class SparkDedupTest implements Serializable {
.distinct() .distinct()
.count(); .count();
long mergedDs =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel")
.as(Encoders.bean(Relation.class))
.where("relClass=='merges'")
.javaRDD()
.map(Relation::getTarget)
.distinct()
.count();
long mergedOrp =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
.as(Encoders.bean(Relation.class))
.where("relClass=='merges'")
.javaRDD()
.map(Relation::getTarget)
.distinct()
.count();
assertEquals(897, publications); assertEquals(897, publications);
assertEquals(835, organizations); assertEquals(835, organizations);
assertEquals(100, projects); assertEquals(100, projects);
assertEquals(100, datasource); assertEquals(100, datasource);
assertEquals(200, softwares); assertEquals(200, softwares);
assertEquals(388, dataset);
assertEquals(517, otherresearchproduct);
long deletedOrgs = long deletedOrgs =
jsc.textFile(testDedupGraphBasePath + "/organization") jsc.textFile(testDedupGraphBasePath + "/organization")
@ -303,9 +371,21 @@ public class SparkDedupTest implements Serializable {
.filter(this::isDeletedByInference) .filter(this::isDeletedByInference)
.count(); .count();
long deletedDs =
jsc.textFile(testDedupGraphBasePath + "/dataset")
.filter(this::isDeletedByInference)
.count();
long deletedOrp =
jsc.textFile(testDedupGraphBasePath + "/otherresearchproduct")
.filter(this::isDeletedByInference)
.count();
assertEquals(mergedOrgs, deletedOrgs); assertEquals(mergedOrgs, deletedOrgs);
assertEquals(mergedPubs, deletedPubs); assertEquals(mergedPubs, deletedPubs);
assertEquals(mergedSw, deletedSw); assertEquals(mergedSw, deletedSw);
assertEquals(mergedDs, deletedDs);
assertEquals(mergedOrp, deletedOrp);
} }
@Test @Test

View File

@ -11,7 +11,9 @@
"maxChildren" : "100", "maxChildren" : "100",
"slidingWindowSize" : "200", "slidingWindowSize" : "200",
"rootBuilder" : ["result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], "rootBuilder" : ["result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true" "includeChildren" : "true",
"idPath" : "$.id",
"maxIterations" : 20
}, },
"pace" : { "pace" : {
"clustering" : [ "clustering" : [
@ -70,7 +72,8 @@
"field": "title", "field": "title",
"comparator": "levensteinTitle", "comparator": "levensteinTitle",
"weight": 1.0, "weight": 1.0,
"countIfUndefined": "true" "countIfUndefined": "true",
"params": {}
} }
], ],
"threshold": 0.99, "threshold": 0.99,
@ -85,7 +88,7 @@
{ {
"name" : "doi", "name" : "doi",
"type" : "String", "type" : "String",
"path" : "$.pid[@.qualifier.classid = 'doi'].value" "path" : "$.pid[?(@.qualifier.classid == 'doi')].value"
}, },
{ {
"name" : "pid", "name" : "pid",
@ -96,7 +99,7 @@
{ {
"name" : "title", "name" : "title",
"type" : "String", "type" : "String",
"path" : "$.title[@.qualifier.classid = 'main title'].value", "path" : "$.title[?(@.qualifier.classid == 'main title')].value",
"length" : 250, "length" : 250,
"size" : 5 "size" : 5
}, },

View File

@ -11,7 +11,9 @@
"maxChildren" : "100", "maxChildren" : "100",
"slidingWindowSize" : "200", "slidingWindowSize" : "200",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true" "includeChildren" : "true",
"idPath" : "$.id",
"maxIterations" : 20
}, },
"pace" : { "pace" : {
"clustering" : [ "clustering" : [
@ -70,7 +72,8 @@
"field": "title", "field": "title",
"comparator": "levensteinTitle", "comparator": "levensteinTitle",
"weight": 1.0, "weight": 1.0,
"countIfUndefined": "true" "countIfUndefined": "true",
"params": {}
} }
], ],
"threshold": 0.99, "threshold": 0.99,
@ -85,7 +88,7 @@
{ {
"name" : "doi", "name" : "doi",
"type" : "String", "type" : "String",
"path" : "$.pid[@.qualifier.classid = 'doi'}].value" "path" : "$.pid[?(@.qualifier.classid == 'doi')].value"
}, },
{ {
"name" : "pid", "name" : "pid",
@ -96,7 +99,7 @@
{ {
"name" : "title", "name" : "title",
"type" : "String", "type" : "String",
"path" : "$.title[@.qualifier.classid = 'main title'].value", "path" : "$.title[?(@.qualifier.classid == 'main title')].value",
"length" : 250, "length" : 250,
"size" : 5 "size" : 5
}, },

View File

@ -11,7 +11,9 @@
"maxChildren" : "100", "maxChildren" : "100",
"slidingWindowSize" : "200", "slidingWindowSize" : "200",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true" "includeChildren" : "true",
"idPath" : "$.id",
"maxIterations" : 20
}, },
"pace" : { "pace" : {
"clustering" : [ "clustering" : [

File diff suppressed because one or more lines are too long

View File

@ -15,6 +15,8 @@
<SCAN id="organization"/> <SCAN id="organization"/>
<SCAN id="publication"/> <SCAN id="publication"/>
<SCAN id="software"/> <SCAN id="software"/>
<SCAN id="dataset"/>
<SCAN id="otherresearchproduct"/>
</SCAN_SEQUENCE> </SCAN_SEQUENCE>
</DEDUPLICATION> </DEDUPLICATION>
</CONFIGURATION> </CONFIGURATION>