minor changes

This commit is contained in:
Michele De Bonis 2022-11-21 14:35:46 +01:00
parent 5aebe63f22
commit 42cff050e7
7 changed files with 98 additions and 79 deletions

View File

@ -3,7 +3,7 @@
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "datasource",
"orderField" : "officialname",
"orderField" : "englishname",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
@ -16,6 +16,7 @@
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} },
{ "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } },
{"name" : "ngrams", "fields" : ["officialname", "englishname"], "params" : {"ngramLen": 4, "max" : 2, "maxPerToken": 2, "minNgramLen": 1, "collapseOn:name": "0"}},
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
],
"decisionTree" : {
@ -80,9 +81,7 @@
{ "name" : "officialname", "type" : "String", "path" : "$.officialname" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
],
"blacklists" : {
"legalname" : []
},
"blacklists" : {},
"synonyms": {}
}
}

View File

@ -9,7 +9,7 @@
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"slidingWindowSize": "100",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
@ -29,7 +29,7 @@
},
"pace": {
"clustering" : [
{ "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} }
],
"decisionTree": {
@ -42,18 +42,36 @@
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
"jpath_classid": "$.qualifier.classid",
"mode": "count"
}
}
],
"threshold": 0.5,
"threshold": 1.0,
"aggregation": "MAX",
"positive": "softCheck",
"negative": "hardCheck1",
"undefined": "hardCheck1",
"positive": "MATCH",
"negative": "versionCheck",
"undefined": "versionCheck",
"ignoreUndefined": "true"
},
"softCheck": {
"versionCheck": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "titleCheck",
"negative": "NO_MATCH",
"undefined": "titleCheck",
"ignoreUndefined": "false"
},
"titleCheck": {
"fields": [
{
"field": "title",
@ -64,52 +82,28 @@
}
],
"threshold": 0.9,
"aggregation": "AVG",
"positive": "MATCH",
"aggregation": "MAX",
"positive": "authorsCheck",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"hardCheck1": {
"authorsCheck": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "hardCheck2",
"negative": "NO_MATCH",
"undefined": "hardCheck2",
"ignoreUndefined": "false"
},
"hardCheck2": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
"undefined": "MATCH",
"ignoreUndefined": "false"
}
},
"model": [

View File

@ -0,0 +1,4 @@
{"websiteurl": "https://fairsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "eosc________::oxford_e-research_centre::oxford_e-research_centre.fairsharing"}
{"websiteurl": "https://FAIRsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "fairsharing_::2521"}
{"websiteurl": "https://fairsharing.org/", "englishname": "formerly: biosharing", "officialname": "FAIRsharing", "id": "re3data_____::r3d100010142"}
{"websiteurl": "https://fairsharing.org/", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "openaire____::fairsharing"}

View File

@ -16,6 +16,7 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {
@Override
protected String getValue(final Field f) {
try {
return asUrl(super.getValue(f)).getHost();
} catch (MalformedURLException e) {

View File

@ -161,11 +161,14 @@ public class BlockProcessorForTesting {
}
else {
//use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
if(useTree)
if (useTree)
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
else
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
}
// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
// emitOutput(true, idPivot, idCurr, context);
// }
}
}
@ -180,44 +183,45 @@ public class BlockProcessorForTesting {
return compare>=1.0;
}
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
//if the score gives 1, the publications are equivalent
Map<String, String> params = new HashMap<>();
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
params.put("mode", "count");
boolean startLayer = false;
boolean hardcheck1Layer = false;
double score = 0.0;
//START - comparison of the PIDs json lists
Map<String, String> params = new HashMap<>();
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
JsonListMatch jsonListMatch = new JsonListMatch(params);
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
if (result >= 0.5) //if the result of the comparison is greater than the threshold
startLayer = true;
//HARDCHECK1 - comparison of title versions and authors size
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
SizeMatch sizeMatch = new SizeMatch(params);
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if (Math.min(result1, result2) != 0)
hardcheck1Layer = true;
//SOFTCHECK and HARDCHECK2 - comparison of the titles
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
double titleScore = Double.isNaN(result3)?0.0:result3;
if (startLayer) {
return titleScore >= 0.90;
}
else {
if (hardcheck1Layer) {
return titleScore >= 0.99;
}
}
return false;
//levenstein title
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
score += 0.2;
}
//pid
JsonListMatch jsonListMatch = new JsonListMatch(params);
if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
score += 0.5;
}
//title version
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
if(result1<0 || result1>=1.0) {
score += 0.1;
}
//authors match
params.remove("mode");
AuthorsMatch authorsMatch = new AuthorsMatch(params);
double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if(result2 <0|| result2>=0.6) {
score += 0.2;
}
return score>=0.5;
}
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
if (result) {

View File

@ -103,6 +103,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
params.put("len", 3);
params.put("max", 1);
System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
}
@Test

View File

@ -272,5 +272,17 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, result);
}
/**
 * Runs {@link DomainExactMatch} on two URL fields that differ by the
 * {@code www.} prefix and a trailing slash, and prints the resulting score.
 * The score is only printed; this test makes no assertion on it.
 */
@Test
public void domainExactMatch() {
    final DomainExactMatch comparator = new DomainExactMatch(params);

    // Same site written two ways: with "www." and without it (plus a trailing slash).
    final Field withWwwPrefix = url("http://www.flowrepository.org");
    final Field withoutWwwPrefix = url("http://flowrepository.org/");

    final double score = comparator.compare(withWwwPrefix, withoutWwwPrefix, conf);
    System.out.println("compare = " + score);
}
}