minor changes

This commit is contained in:
Michele De Bonis 2022-11-21 14:35:46 +01:00
parent 5aebe63f22
commit 42cff050e7
7 changed files with 98 additions and 79 deletions

View File

@ -3,7 +3,7 @@
"threshold" : "0.99", "threshold" : "0.99",
"dedupRun" : "001", "dedupRun" : "001",
"entityType" : "datasource", "entityType" : "datasource",
"orderField" : "officialname", "orderField" : "englishname",
"queueMaxSize" : "2000", "queueMaxSize" : "2000",
"groupMaxSize" : "50", "groupMaxSize" : "50",
"slidingWindowSize" : "200", "slidingWindowSize" : "200",
@ -16,6 +16,7 @@
"clustering" : [ "clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} }, { "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} },
{ "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } }, { "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } },
{"name" : "ngrams", "fields" : ["officialname", "englishname"], "params" : {"ngramLen": 4, "max" : 2, "maxPerToken": 2, "minNgramLen": 1, "collapseOn:name": "0"}},
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
], ],
"decisionTree" : { "decisionTree" : {
@ -80,9 +81,7 @@
{ "name" : "officialname", "type" : "String", "path" : "$.officialname" }, { "name" : "officialname", "type" : "String", "path" : "$.officialname" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" } { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
], ],
"blacklists" : { "blacklists" : {},
"legalname" : []
},
"synonyms": {} "synonyms": {}
} }
} }

View File

@ -9,7 +9,7 @@
"queueMaxSize": "200", "queueMaxSize": "200",
"groupMaxSize": "100", "groupMaxSize": "100",
"maxChildren": "100", "maxChildren": "100",
"slidingWindowSize": "50", "slidingWindowSize": "100",
"rootBuilder": [ "rootBuilder": [
"result", "result",
"resultProject_outcome_isProducedBy", "resultProject_outcome_isProducedBy",
@ -29,7 +29,7 @@
}, },
"pace": { "pace": {
"clustering" : [ "clustering" : [
{ "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } }, { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} } { "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} }
], ],
"decisionTree": { "decisionTree": {
@ -42,18 +42,36 @@
"countIfUndefined": "false", "countIfUndefined": "false",
"params": { "params": {
"jpath_value": "$.value", "jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid" "jpath_classid": "$.qualifier.classid",
"mode": "count"
} }
} }
], ],
"threshold": 0.5, "threshold": 1.0,
"aggregation": "MAX", "aggregation": "MAX",
"positive": "softCheck", "positive": "MATCH",
"negative": "hardCheck1", "negative": "versionCheck",
"undefined": "hardCheck1", "undefined": "versionCheck",
"ignoreUndefined": "true" "ignoreUndefined": "true"
}, },
"softCheck": { "versionCheck": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "titleCheck",
"negative": "NO_MATCH",
"undefined": "titleCheck",
"ignoreUndefined": "false"
},
"titleCheck": {
"fields": [ "fields": [
{ {
"field": "title", "field": "title",
@ -64,52 +82,28 @@
} }
], ],
"threshold": 0.9, "threshold": 0.9,
"aggregation": "AVG", "aggregation": "MAX",
"positive": "MATCH", "positive": "authorsCheck",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "NO_MATCH", "undefined": "NO_MATCH",
"ignoreUndefined": "true" "ignoreUndefined": "true"
}, },
"hardCheck1": { "authorsCheck": {
"fields": [ "fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{ {
"field": "authors", "field": "authors",
"comparator": "sizeMatch", "comparator": "authorsMatch",
"weight": 1.0, "weight": 1.0,
"countIfUndefined": "false", "countIfUndefined": "false",
"params": {} "params": {}
} }
], ],
"threshold": 1.0, "threshold": 0.6,
"aggregation": "AND", "aggregation": "MAX",
"positive": "hardCheck2",
"negative": "NO_MATCH",
"undefined": "hardCheck2",
"ignoreUndefined": "false"
},
"hardCheck2": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH", "positive": "MATCH",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "NO_MATCH", "undefined": "MATCH",
"ignoreUndefined": "true" "ignoreUndefined": "false"
} }
}, },
"model": [ "model": [

View File

@ -0,0 +1,4 @@
{"websiteurl": "https://fairsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "eosc________::oxford_e-research_centre::oxford_e-research_centre.fairsharing"}
{"websiteurl": "https://FAIRsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "fairsharing_::2521"}
{"websiteurl": "https://fairsharing.org/", "englishname": "formerly: biosharing", "officialname": "FAIRsharing", "id": "re3data_____::r3d100010142"}
{"websiteurl": "https://fairsharing.org/", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "openaire____::fairsharing"}

View File

@ -16,6 +16,7 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {
@Override @Override
protected String getValue(final Field f) { protected String getValue(final Field f) {
try { try {
return asUrl(super.getValue(f)).getHost(); return asUrl(super.getValue(f)).getHost();
} catch (MalformedURLException e) { } catch (MalformedURLException e) {

View File

@ -166,6 +166,9 @@ public class BlockProcessorForTesting {
else else
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context); emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
} }
// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
// emitOutput(true, idPivot, idCurr, context);
// }
} }
} }
@ -181,41 +184,42 @@ public class BlockProcessorForTesting {
} }
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) { private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
//if the score gives 1, the publications are equivalent
boolean startLayer = false;
boolean hardcheck1Layer = false;
//START - comparison of the PIDs json lists
Map<String, String> params = new HashMap<>(); Map<String, String> params = new HashMap<>();
params.put("jpath_value", "$.value"); params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid"); params.put("jpath_classid", "$.qualifier.classid");
JsonListMatch jsonListMatch = new JsonListMatch(params); params.put("mode", "count");
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
if (result >= 0.5) //if the result of the comparison is greater than the threshold
startLayer = true;
//HARDCHECK1 - comparison of title versions and authors size double score = 0.0;
//levenstein title
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
score += 0.2;
}
//pid
JsonListMatch jsonListMatch = new JsonListMatch(params);
if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
score += 0.5;
}
//title version
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params); TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
SizeMatch sizeMatch = new SizeMatch(params); if(result1<0 || result1>=1.0) {
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config); score += 0.1;
if (Math.min(result1, result2) != 0) }
hardcheck1Layer = true;
//SOFTCHECK and HARDCHECK2 - comparison of the titles //authors match
LevensteinTitle levensteinTitle = new LevensteinTitle(params); params.remove("mode");
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); AuthorsMatch authorsMatch = new AuthorsMatch(params);
double titleScore = Double.isNaN(result3)?0.0:result3; double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if(result2 <0|| result2>=0.6) {
score += 0.2;
}
if (startLayer) { return score>=0.5;
return titleScore >= 0.90;
}
else {
if (hardcheck1Layer) {
return titleScore >= 0.99;
}
}
return false;
} }
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {

View File

@ -103,6 +103,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson"; final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s); System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
params.put("len", 3);
params.put("max", 1);
System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
} }
@Test @Test

View File

@ -272,5 +272,17 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, result); assertEquals(1.0, result);
} }
@Test
public void domainExactMatch() {
DomainExactMatch domainExactMatch = new DomainExactMatch(params);
Field a = url("http://www.flowrepository.org");
Field b = url("http://flowrepository.org/");
double compare = domainExactMatch.compare(a, b, conf);
System.out.println("compare = " + compare);
}
} }