spark dedup test fixed, sample for dataset and orp added, test implemented
This commit is contained in:
parent
8851050814
commit
8d258c85ff
|
@ -57,7 +57,6 @@ public class SparkDedupTest implements Serializable {
|
||||||
.toURI())
|
.toURI())
|
||||||
.toFile()
|
.toFile()
|
||||||
.getAbsolutePath();
|
.getAbsolutePath();
|
||||||
|
|
||||||
testOutputBasePath =
|
testOutputBasePath =
|
||||||
createTempDirectory(SparkDedupTest.class.getSimpleName() + "-")
|
createTempDirectory(SparkDedupTest.class.getSimpleName() + "-")
|
||||||
.toAbsolutePath()
|
.toAbsolutePath()
|
||||||
|
@ -110,6 +109,20 @@ public class SparkDedupTest implements Serializable {
|
||||||
IOUtils.toString(
|
IOUtils.toString(
|
||||||
SparkDedupTest.class.getResourceAsStream(
|
SparkDedupTest.class.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json")));
|
"/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json")));
|
||||||
|
|
||||||
|
lenient()
|
||||||
|
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset")))
|
||||||
|
.thenReturn(
|
||||||
|
IOUtils.toString(
|
||||||
|
SparkDedupTest.class.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json")));
|
||||||
|
|
||||||
|
lenient()
|
||||||
|
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct")))
|
||||||
|
.thenReturn(
|
||||||
|
IOUtils.toString(
|
||||||
|
SparkDedupTest.class.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json")));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -144,9 +157,21 @@ public class SparkDedupTest implements Serializable {
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
|
long ds_simrel =
|
||||||
|
spark.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel")
|
||||||
|
.count();
|
||||||
|
|
||||||
|
long orp_simrel =
|
||||||
|
spark.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
|
||||||
|
.count();
|
||||||
|
|
||||||
assertEquals(3432, orgs_simrel);
|
assertEquals(3432, orgs_simrel);
|
||||||
assertEquals(7260, pubs_simrel);
|
assertEquals(7260, pubs_simrel);
|
||||||
assertEquals(344, sw_simrel);
|
assertEquals(344, sw_simrel);
|
||||||
|
assertEquals(458, ds_simrel);
|
||||||
|
assertEquals(6740, orp_simrel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -181,9 +206,21 @@ public class SparkDedupTest implements Serializable {
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
|
long ds_mergerel =
|
||||||
|
spark.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel")
|
||||||
|
.count();
|
||||||
|
|
||||||
|
long orp_mergerel =
|
||||||
|
spark.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
|
||||||
|
.count();
|
||||||
|
|
||||||
assertEquals(1276, orgs_mergerel);
|
assertEquals(1276, orgs_mergerel);
|
||||||
assertEquals(1460, pubs_mergerel);
|
assertEquals(1460, pubs_mergerel);
|
||||||
assertEquals(288, sw_mergerel);
|
assertEquals(288, sw_mergerel);
|
||||||
|
assertEquals(472, ds_mergerel);
|
||||||
|
assertEquals(714, orp_mergerel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -217,15 +254,22 @@ public class SparkDedupTest implements Serializable {
|
||||||
testOutputBasePath
|
testOutputBasePath
|
||||||
+ "/"
|
+ "/"
|
||||||
+ testActionSetId
|
+ testActionSetId
|
||||||
+ "/publication_deduprecord")
|
+ "/publication_deduprecord").count();
|
||||||
.count();
|
|
||||||
long sw_deduprecord =
|
long sw_deduprecord =
|
||||||
jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord")
|
jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord")
|
||||||
.count();
|
.count();
|
||||||
|
long ds_deduprecord =
|
||||||
|
jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_deduprecord")
|
||||||
|
.count();
|
||||||
|
long orp_deduprecord =
|
||||||
|
jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
|
||||||
|
.count();
|
||||||
|
|
||||||
assertEquals(82, orgs_deduprecord);
|
assertEquals(82, orgs_deduprecord);
|
||||||
assertEquals(66, pubs_deduprecord);
|
assertEquals(66, pubs_deduprecord);
|
||||||
assertEquals(51, sw_deduprecord);
|
assertEquals(51, sw_deduprecord);
|
||||||
|
assertEquals(96, ds_deduprecord);
|
||||||
|
assertEquals(89, orp_deduprecord);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -251,6 +295,8 @@ public class SparkDedupTest implements Serializable {
|
||||||
long projects = jsc.textFile(testDedupGraphBasePath + "/project").count();
|
long projects = jsc.textFile(testDedupGraphBasePath + "/project").count();
|
||||||
long datasource = jsc.textFile(testDedupGraphBasePath + "/datasource").count();
|
long datasource = jsc.textFile(testDedupGraphBasePath + "/datasource").count();
|
||||||
long softwares = jsc.textFile(testDedupGraphBasePath + "/software").count();
|
long softwares = jsc.textFile(testDedupGraphBasePath + "/software").count();
|
||||||
|
long dataset = jsc.textFile(testDedupGraphBasePath + "/dataset").count();
|
||||||
|
long otherresearchproduct = jsc.textFile(testDedupGraphBasePath + "/otherresearchproduct").count();
|
||||||
|
|
||||||
long mergedOrgs =
|
long mergedOrgs =
|
||||||
spark.read()
|
spark.read()
|
||||||
|
@ -282,11 +328,33 @@ public class SparkDedupTest implements Serializable {
|
||||||
.distinct()
|
.distinct()
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
|
long mergedDs =
|
||||||
|
spark.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel")
|
||||||
|
.as(Encoders.bean(Relation.class))
|
||||||
|
.where("relClass=='merges'")
|
||||||
|
.javaRDD()
|
||||||
|
.map(Relation::getTarget)
|
||||||
|
.distinct()
|
||||||
|
.count();
|
||||||
|
|
||||||
|
long mergedOrp =
|
||||||
|
spark.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
|
||||||
|
.as(Encoders.bean(Relation.class))
|
||||||
|
.where("relClass=='merges'")
|
||||||
|
.javaRDD()
|
||||||
|
.map(Relation::getTarget)
|
||||||
|
.distinct()
|
||||||
|
.count();
|
||||||
|
|
||||||
assertEquals(897, publications);
|
assertEquals(897, publications);
|
||||||
assertEquals(835, organizations);
|
assertEquals(835, organizations);
|
||||||
assertEquals(100, projects);
|
assertEquals(100, projects);
|
||||||
assertEquals(100, datasource);
|
assertEquals(100, datasource);
|
||||||
assertEquals(200, softwares);
|
assertEquals(200, softwares);
|
||||||
|
assertEquals(388, dataset);
|
||||||
|
assertEquals(517, otherresearchproduct);
|
||||||
|
|
||||||
long deletedOrgs =
|
long deletedOrgs =
|
||||||
jsc.textFile(testDedupGraphBasePath + "/organization")
|
jsc.textFile(testDedupGraphBasePath + "/organization")
|
||||||
|
@ -303,9 +371,21 @@ public class SparkDedupTest implements Serializable {
|
||||||
.filter(this::isDeletedByInference)
|
.filter(this::isDeletedByInference)
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
|
long deletedDs =
|
||||||
|
jsc.textFile(testDedupGraphBasePath + "/dataset")
|
||||||
|
.filter(this::isDeletedByInference)
|
||||||
|
.count();
|
||||||
|
|
||||||
|
long deletedOrp =
|
||||||
|
jsc.textFile(testDedupGraphBasePath + "/otherresearchproduct")
|
||||||
|
.filter(this::isDeletedByInference)
|
||||||
|
.count();
|
||||||
|
|
||||||
assertEquals(mergedOrgs, deletedOrgs);
|
assertEquals(mergedOrgs, deletedOrgs);
|
||||||
assertEquals(mergedPubs, deletedPubs);
|
assertEquals(mergedPubs, deletedPubs);
|
||||||
assertEquals(mergedSw, deletedSw);
|
assertEquals(mergedSw, deletedSw);
|
||||||
|
assertEquals(mergedDs, deletedDs);
|
||||||
|
assertEquals(mergedOrp, deletedOrp);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -11,7 +11,9 @@
|
||||||
"maxChildren" : "100",
|
"maxChildren" : "100",
|
||||||
"slidingWindowSize" : "200",
|
"slidingWindowSize" : "200",
|
||||||
"rootBuilder" : ["result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
"rootBuilder" : ["result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
||||||
"includeChildren" : "true"
|
"includeChildren" : "true",
|
||||||
|
"idPath" : "$.id",
|
||||||
|
"maxIterations" : 20
|
||||||
},
|
},
|
||||||
"pace" : {
|
"pace" : {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
|
@ -70,7 +72,8 @@
|
||||||
"field": "title",
|
"field": "title",
|
||||||
"comparator": "levensteinTitle",
|
"comparator": "levensteinTitle",
|
||||||
"weight": 1.0,
|
"weight": 1.0,
|
||||||
"countIfUndefined": "true"
|
"countIfUndefined": "true",
|
||||||
|
"params": {}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.99,
|
"threshold": 0.99,
|
||||||
|
@ -85,7 +88,7 @@
|
||||||
{
|
{
|
||||||
"name" : "doi",
|
"name" : "doi",
|
||||||
"type" : "String",
|
"type" : "String",
|
||||||
"path" : "$.pid[@.qualifier.classid = 'doi'].value"
|
"path" : "$.pid[?(@.qualifier.classid == 'doi')].value"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name" : "pid",
|
"name" : "pid",
|
||||||
|
@ -96,7 +99,7 @@
|
||||||
{
|
{
|
||||||
"name" : "title",
|
"name" : "title",
|
||||||
"type" : "String",
|
"type" : "String",
|
||||||
"path" : "$.title[@.qualifier.classid = 'main title'].value",
|
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
|
||||||
"length" : 250,
|
"length" : 250,
|
||||||
"size" : 5
|
"size" : 5
|
||||||
},
|
},
|
||||||
|
|
|
@ -11,7 +11,9 @@
|
||||||
"maxChildren" : "100",
|
"maxChildren" : "100",
|
||||||
"slidingWindowSize" : "200",
|
"slidingWindowSize" : "200",
|
||||||
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
||||||
"includeChildren" : "true"
|
"includeChildren" : "true",
|
||||||
|
"idPath" : "$.id",
|
||||||
|
"maxIterations" : 20
|
||||||
},
|
},
|
||||||
"pace" : {
|
"pace" : {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
|
@ -70,7 +72,8 @@
|
||||||
"field": "title",
|
"field": "title",
|
||||||
"comparator": "levensteinTitle",
|
"comparator": "levensteinTitle",
|
||||||
"weight": 1.0,
|
"weight": 1.0,
|
||||||
"countIfUndefined": "true"
|
"countIfUndefined": "true",
|
||||||
|
"params": {}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.99,
|
"threshold": 0.99,
|
||||||
|
@ -85,7 +88,7 @@
|
||||||
{
|
{
|
||||||
"name" : "doi",
|
"name" : "doi",
|
||||||
"type" : "String",
|
"type" : "String",
|
||||||
"path" : "$.pid[@.qualifier.classid = 'doi'}].value"
|
"path" : "$.pid[?(@.qualifier.classid == 'doi')].value"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name" : "pid",
|
"name" : "pid",
|
||||||
|
@ -96,7 +99,7 @@
|
||||||
{
|
{
|
||||||
"name" : "title",
|
"name" : "title",
|
||||||
"type" : "String",
|
"type" : "String",
|
||||||
"path" : "$.title[@.qualifier.classid = 'main title'].value",
|
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
|
||||||
"length" : 250,
|
"length" : 250,
|
||||||
"size" : 5
|
"size" : 5
|
||||||
},
|
},
|
||||||
|
|
|
@ -11,7 +11,9 @@
|
||||||
"maxChildren" : "100",
|
"maxChildren" : "100",
|
||||||
"slidingWindowSize" : "200",
|
"slidingWindowSize" : "200",
|
||||||
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
|
||||||
"includeChildren" : "true"
|
"includeChildren" : "true",
|
||||||
|
"idPath" : "$.id",
|
||||||
|
"maxIterations" : 20
|
||||||
},
|
},
|
||||||
"pace" : {
|
"pace" : {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -15,6 +15,8 @@
|
||||||
<SCAN id="organization"/>
|
<SCAN id="organization"/>
|
||||||
<SCAN id="publication"/>
|
<SCAN id="publication"/>
|
||||||
<SCAN id="software"/>
|
<SCAN id="software"/>
|
||||||
|
<SCAN id="dataset"/>
|
||||||
|
<SCAN id="otherresearchproduct"/>
|
||||||
</SCAN_SEQUENCE>
|
</SCAN_SEQUENCE>
|
||||||
</DEDUPLICATION>
|
</DEDUPLICATION>
|
||||||
</CONFIGURATION>
|
</CONFIGURATION>
|
||||||
|
|
Loading…
Reference in New Issue