Added test for software dedup. Defined the otherresearchproduct (orp), dataset and software (sw) dedup configurations

This commit is contained in:
miconis 2020-04-17 17:31:59 +02:00
parent 418cf94642
commit 6450bb0daa
8 changed files with 694 additions and 1 deletions

View File

@ -81,6 +81,10 @@ public class SparkDedupTest implements Serializable {
lenient().when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication"))) lenient().when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
.thenReturn(IOUtils.toString(SparkDedupTest.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"))); .thenReturn(IOUtils.toString(SparkDedupTest.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")));
lenient().when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software")))
.thenReturn(IOUtils.toString(SparkDedupTest.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json")));
} }
@Test @Test
@ -101,9 +105,11 @@ public class SparkDedupTest implements Serializable {
long orgs_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel").count(); long orgs_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel").count();
long pubs_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel").count(); long pubs_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel").count();
long sw_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/software_simrel").count();
assertEquals(3288, orgs_simrel); assertEquals(3288, orgs_simrel);
assertEquals(7260, pubs_simrel); assertEquals(7260, pubs_simrel);
assertEquals(344, sw_simrel);
} }
@Test @Test
@ -124,10 +130,11 @@ public class SparkDedupTest implements Serializable {
long orgs_mergerel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel").count(); long orgs_mergerel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel").count();
long pubs_mergerel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel").count(); long pubs_mergerel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel").count();
long sw_mergerel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel").count();
assertEquals(1244, orgs_mergerel); assertEquals(1244, orgs_mergerel);
assertEquals(1460, pubs_mergerel); assertEquals(1460, pubs_mergerel);
assertEquals(288, sw_mergerel);
} }
@Test @Test
@ -148,9 +155,11 @@ public class SparkDedupTest implements Serializable {
long orgs_deduprecord = jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_deduprecord").count(); long orgs_deduprecord = jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_deduprecord").count();
long pubs_deduprecord = jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord").count(); long pubs_deduprecord = jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord").count();
long sw_deduprecord = jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord").count();
assertEquals(82, orgs_deduprecord); assertEquals(82, orgs_deduprecord);
assertEquals(66, pubs_deduprecord); assertEquals(66, pubs_deduprecord);
assertEquals(51, sw_deduprecord);
} }
@Test @Test
@ -171,6 +180,9 @@ public class SparkDedupTest implements Serializable {
long organizations = jsc.textFile(testDedupGraphBasePath + "/organization").count(); long organizations = jsc.textFile(testDedupGraphBasePath + "/organization").count();
long publications = jsc.textFile(testDedupGraphBasePath + "/publication").count(); long publications = jsc.textFile(testDedupGraphBasePath + "/publication").count();
long projects = jsc.textFile(testDedupGraphBasePath + "/project").count();
long datasource = jsc.textFile(testDedupGraphBasePath + "/datasource").count();
long softwares = jsc.textFile(testDedupGraphBasePath + "/software").count();
long mergedOrgs = spark long mergedOrgs = spark
.read().load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel").as(Encoders.bean(Relation.class)) .read().load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel").as(Encoders.bean(Relation.class))
@ -188,6 +200,9 @@ public class SparkDedupTest implements Serializable {
assertEquals(897, publications); assertEquals(897, publications);
assertEquals(835, organizations); assertEquals(835, organizations);
assertEquals(100, projects);
assertEquals(100, datasource);
assertEquals(200, softwares);
long deletedOrgs = jsc.textFile(testDedupGraphBasePath + "/organization") long deletedOrgs = jsc.textFile(testDedupGraphBasePath + "/organization")
.filter(this::isDeletedByInference).count(); .filter(this::isDeletedByInference).count();

View File

@ -0,0 +1,118 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"subEntityType" : "resulttype",
"subEntityValue" : "dataset",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "100",
"maxChildren" : "100",
"slidingWindowSize" : "200",
"rootBuilder" : ["result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
"decisionTree" : {
"start" : {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 0.5,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer2" : {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3" : {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true"
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model" : [
{
"name" : "doi",
"type" : "String",
"path" : "$.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "pid",
"type" : "JSON",
"path" : "$.pid",
"overrideMatch" : "true"
},
{
"name" : "title",
"type" : "String",
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
"length" : 250,
"size" : 5
},
{
"name" : "authors",
"type" : "String",
"path" : "$.author[*].fullname",
"size" : 200
},
{
"name" : "resulttype",
"type" : "String",
"path" : "$.resulttype.classid"
}
],
"blacklists" : {},
"synonyms" : {}
}
}

View File

@ -0,0 +1,118 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"subEntityType" : "resulttype",
"subEntityValue" : "otherresearchproduct",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "100",
"maxChildren" : "100",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
"decisionTree" : {
"start" : {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 0.5,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer2" : {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3" : {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true"
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model" : [
{
"name" : "doi",
"type" : "String",
"path" : "$.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "pid",
"type" : "JSON",
"path" : "$.pid",
"overrideMatch" : "true"
},
{
"name" : "title",
"type" : "String",
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
"length" : 250,
"size" : 5
},
{
"name" : "authors",
"type" : "String",
"path" : "$.author[*].fullname",
"size" : 200
},
{
"name" : "resulttype",
"type" : "String",
"path" : "$.resulttype.classid"
}
],
"blacklists" : {},
"synonyms" : {}
}
}

View File

@ -0,0 +1,92 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"subEntityType" : "resulttype",
"subEntityValue" : "software",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "100",
"maxChildren" : "100",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi", "url" ], "params" : { } }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "doi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
},
{
"field": "url",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "OR",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "false"
},
"layer2": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitleIgnoreVersion",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model" : [
{
"name" : "doi",
"type" : "String",
"path" : "$.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "title",
"type" : "String",
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
"length" : 250,
"size" : 5
},
{
"name" : "url",
"type" : "String",
"path" : "$.instance.url"
},
{
"name" : "resulttype",
"type" : "String",
"path" : "$.resulttype.classid"
}
],
"blacklists" : {},
"synonyms": {}
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -14,6 +14,7 @@
<SCAN_SEQUENCE> <SCAN_SEQUENCE>
<SCAN id="organization"/> <SCAN id="organization"/>
<SCAN id="publication"/> <SCAN id="publication"/>
<SCAN id="software"/>
</SCAN_SEQUENCE> </SCAN_SEQUENCE>
</DEDUPLICATION> </DEDUPLICATION>
</CONFIGURATION> </CONFIGURATION>