Fix the Spark dedup test; add sample data for dataset and otherresearchproduct (orp) entities and implement the corresponding tests

This commit is contained in:
miconis 2020-04-23 18:16:20 +02:00
parent 8851050814
commit 8d258c85ff
7 changed files with 822 additions and 12 deletions

View File

@ -57,7 +57,6 @@ public class SparkDedupTest implements Serializable {
.toURI())
.toFile()
.getAbsolutePath();
testOutputBasePath =
createTempDirectory(SparkDedupTest.class.getSimpleName() + "-")
.toAbsolutePath()
@ -110,6 +109,20 @@ public class SparkDedupTest implements Serializable {
IOUtils.toString(
SparkDedupTest.class.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json")));
lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset")))
.thenReturn(
IOUtils.toString(
SparkDedupTest.class.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json")));
lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct")))
.thenReturn(
IOUtils.toString(
SparkDedupTest.class.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json")));
}
@Test
@ -144,9 +157,21 @@ public class SparkDedupTest implements Serializable {
.load(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
.count();
long ds_simrel =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel")
.count();
long orp_simrel =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
.count();
assertEquals(3432, orgs_simrel);
assertEquals(7260, pubs_simrel);
assertEquals(344, sw_simrel);
assertEquals(458, ds_simrel);
assertEquals(6740, orp_simrel);
}
@Test
@ -181,9 +206,21 @@ public class SparkDedupTest implements Serializable {
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
.count();
long ds_mergerel =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel")
.count();
long orp_mergerel =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
.count();
assertEquals(1276, orgs_mergerel);
assertEquals(1460, pubs_mergerel);
assertEquals(288, sw_mergerel);
assertEquals(472, ds_mergerel);
assertEquals(714, orp_mergerel);
}
@Test
@ -217,15 +254,22 @@ public class SparkDedupTest implements Serializable {
testOutputBasePath
+ "/"
+ testActionSetId
+ "/publication_deduprecord")
.count();
+ "/publication_deduprecord").count();
long sw_deduprecord =
jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord")
.count();
long ds_deduprecord =
jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_deduprecord")
.count();
long orp_deduprecord =
jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
.count();
assertEquals(82, orgs_deduprecord);
assertEquals(66, pubs_deduprecord);
assertEquals(51, sw_deduprecord);
assertEquals(96, ds_deduprecord);
assertEquals(89, orp_deduprecord);
}
@Test
@ -251,6 +295,8 @@ public class SparkDedupTest implements Serializable {
long projects = jsc.textFile(testDedupGraphBasePath + "/project").count();
long datasource = jsc.textFile(testDedupGraphBasePath + "/datasource").count();
long softwares = jsc.textFile(testDedupGraphBasePath + "/software").count();
long dataset = jsc.textFile(testDedupGraphBasePath + "/dataset").count();
long otherresearchproduct = jsc.textFile(testDedupGraphBasePath + "/otherresearchproduct").count();
long mergedOrgs =
spark.read()
@ -282,11 +328,33 @@ public class SparkDedupTest implements Serializable {
.distinct()
.count();
long mergedDs =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel")
.as(Encoders.bean(Relation.class))
.where("relClass=='merges'")
.javaRDD()
.map(Relation::getTarget)
.distinct()
.count();
long mergedOrp =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
.as(Encoders.bean(Relation.class))
.where("relClass=='merges'")
.javaRDD()
.map(Relation::getTarget)
.distinct()
.count();
assertEquals(897, publications);
assertEquals(835, organizations);
assertEquals(100, projects);
assertEquals(100, datasource);
assertEquals(200, softwares);
assertEquals(388, dataset);
assertEquals(517, otherresearchproduct);
long deletedOrgs =
jsc.textFile(testDedupGraphBasePath + "/organization")
@ -303,9 +371,21 @@ public class SparkDedupTest implements Serializable {
.filter(this::isDeletedByInference)
.count();
long deletedDs =
jsc.textFile(testDedupGraphBasePath + "/dataset")
.filter(this::isDeletedByInference)
.count();
long deletedOrp =
jsc.textFile(testDedupGraphBasePath + "/otherresearchproduct")
.filter(this::isDeletedByInference)
.count();
assertEquals(mergedOrgs, deletedOrgs);
assertEquals(mergedPubs, deletedPubs);
assertEquals(mergedSw, deletedSw);
assertEquals(mergedDs, deletedDs);
assertEquals(mergedOrp, deletedOrp);
}
@Test

View File

@ -11,7 +11,9 @@
"maxChildren" : "100",
"slidingWindowSize" : "200",
"rootBuilder" : ["result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
"includeChildren" : "true",
"idPath" : "$.id",
"maxIterations" : 20
},
"pace" : {
"clustering" : [
@ -70,7 +72,8 @@
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true"
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
@ -85,7 +88,7 @@
{
"name" : "doi",
"type" : "String",
"path" : "$.pid[@.qualifier.classid = 'doi'].value"
"path" : "$.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "pid",
@ -96,7 +99,7 @@
{
"name" : "title",
"type" : "String",
"path" : "$.title[@.qualifier.classid = 'main title'].value",
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
"length" : 250,
"size" : 5
},

View File

@ -11,7 +11,9 @@
"maxChildren" : "100",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
"includeChildren" : "true",
"idPath" : "$.id",
"maxIterations" : 20
},
"pace" : {
"clustering" : [
@ -70,7 +72,8 @@
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true"
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
@ -85,7 +88,7 @@
{
"name" : "doi",
"type" : "String",
"path" : "$.pid[@.qualifier.classid = 'doi'}].value"
"path" : "$.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "pid",
@ -96,7 +99,7 @@
{
"name" : "title",
"type" : "String",
"path" : "$.title[@.qualifier.classid = 'main title'].value",
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
"length" : 250,
"size" : 5
},

View File

@ -11,7 +11,9 @@
"maxChildren" : "100",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
"includeChildren" : "true",
"idPath" : "$.id",
"maxIterations" : 20
},
"pace" : {
"clustering" : [

File diff suppressed because one or more lines are too long

View File

@ -15,6 +15,8 @@
<SCAN id="organization"/>
<SCAN id="publication"/>
<SCAN id="software"/>
<SCAN id="dataset"/>
<SCAN id="otherresearchproduct"/>
</SCAN_SEQUENCE>
</DEDUPLICATION>
</CONFIGURATION>