minor changes
This commit is contained in:
parent
5aebe63f22
commit
42cff050e7
|
@ -3,7 +3,7 @@
|
|||
"threshold" : "0.99",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "datasource",
|
||||
"orderField" : "officialname",
|
||||
"orderField" : "englishname",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "50",
|
||||
"slidingWindowSize" : "200",
|
||||
|
@ -16,6 +16,7 @@
|
|||
"clustering" : [
|
||||
{ "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } },
|
||||
{"name" : "ngrams", "fields" : ["officialname", "englishname"], "params" : {"ngramLen": 4, "max" : 2, "maxPerToken": 2, "minNgramLen": 1, "collapseOn:name": "0"}},
|
||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
|
||||
],
|
||||
"decisionTree" : {
|
||||
|
@ -80,9 +81,7 @@
|
|||
{ "name" : "officialname", "type" : "String", "path" : "$.officialname" },
|
||||
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
|
||||
],
|
||||
"blacklists" : {
|
||||
"legalname" : []
|
||||
},
|
||||
"blacklists" : {},
|
||||
"synonyms": {}
|
||||
}
|
||||
}
|
|
@ -9,7 +9,7 @@
|
|||
"queueMaxSize": "200",
|
||||
"groupMaxSize": "100",
|
||||
"maxChildren": "100",
|
||||
"slidingWindowSize": "50",
|
||||
"slidingWindowSize": "100",
|
||||
"rootBuilder": [
|
||||
"result",
|
||||
"resultProject_outcome_isProducedBy",
|
||||
|
@ -29,7 +29,7 @@
|
|||
},
|
||||
"pace": {
|
||||
"clustering" : [
|
||||
{ "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
|
||||
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||
{ "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} }
|
||||
],
|
||||
"decisionTree": {
|
||||
|
@ -42,18 +42,36 @@
|
|||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"jpath_value": "$.value",
|
||||
"jpath_classid": "$.qualifier.classid"
|
||||
"jpath_classid": "$.qualifier.classid",
|
||||
"mode": "count"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.5,
|
||||
"threshold": 1.0,
|
||||
"aggregation": "MAX",
|
||||
"positive": "softCheck",
|
||||
"negative": "hardCheck1",
|
||||
"undefined": "hardCheck1",
|
||||
"positive": "MATCH",
|
||||
"negative": "versionCheck",
|
||||
"undefined": "versionCheck",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"softCheck": {
|
||||
"versionCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "titleVersionMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "MAX",
|
||||
"positive": "titleCheck",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "titleCheck",
|
||||
"ignoreUndefined": "false"
|
||||
},
|
||||
"titleCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
|
@ -64,52 +82,28 @@
|
|||
}
|
||||
],
|
||||
"threshold": 0.9,
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"aggregation": "MAX",
|
||||
"positive": "authorsCheck",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"hardCheck1": {
|
||||
"authorsCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "titleVersionMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
},
|
||||
{
|
||||
"field": "authors",
|
||||
"comparator": "sizeMatch",
|
||||
"comparator": "authorsMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "AND",
|
||||
"positive": "hardCheck2",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "hardCheck2",
|
||||
"ignoreUndefined": "false"
|
||||
},
|
||||
"hardCheck2": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "levensteinTitle",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 0.99,
|
||||
"aggregation": "AVG",
|
||||
"threshold": 0.6,
|
||||
"aggregation": "MAX",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
"undefined": "MATCH",
|
||||
"ignoreUndefined": "false"
|
||||
}
|
||||
},
|
||||
"model": [
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
{"websiteurl": "https://fairsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "eosc________::oxford_e-research_centre::oxford_e-research_centre.fairsharing"}
|
||||
{"websiteurl": "https://FAIRsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "fairsharing_::2521"}
|
||||
{"websiteurl": "https://fairsharing.org/", "englishname": "formerly: biosharing", "officialname": "FAIRsharing", "id": "re3data_____::r3d100010142"}
|
||||
{"websiteurl": "https://fairsharing.org/", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "openaire____::fairsharing"}
|
|
@ -16,6 +16,7 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {
|
|||
|
||||
@Override
|
||||
protected String getValue(final Field f) {
|
||||
|
||||
try {
|
||||
return asUrl(super.getValue(f)).getHost();
|
||||
} catch (MalformedURLException e) {
|
||||
|
|
|
@ -161,11 +161,14 @@ public class BlockProcessorForTesting {
|
|||
}
|
||||
else {
|
||||
//use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
|
||||
if(useTree)
|
||||
if (useTree)
|
||||
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
|
||||
else
|
||||
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
|
||||
}
|
||||
// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
|
||||
// emitOutput(true, idPivot, idCurr, context);
|
||||
// }
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -180,44 +183,45 @@ public class BlockProcessorForTesting {
|
|||
return compare>=1.0;
|
||||
}
|
||||
|
||||
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
|
||||
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
|
||||
//if the score gives 1, the publications are equivalent
|
||||
Map<String, String> params = new HashMap<>();
|
||||
params.put("jpath_value", "$.value");
|
||||
params.put("jpath_classid", "$.qualifier.classid");
|
||||
params.put("mode", "count");
|
||||
|
||||
boolean startLayer = false;
|
||||
boolean hardcheck1Layer = false;
|
||||
double score = 0.0;
|
||||
|
||||
//START - comparison of the PIDs json lists
|
||||
Map<String, String> params = new HashMap<>();
|
||||
params.put("jpath_value", "$.value");
|
||||
params.put("jpath_classid", "$.qualifier.classid");
|
||||
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
||||
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
|
||||
if (result >= 0.5) //if the result of the comparison is greater than the threshold
|
||||
startLayer = true;
|
||||
|
||||
//HARDCHECK1 - comparison of title versions and authors size
|
||||
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
||||
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
||||
SizeMatch sizeMatch = new SizeMatch(params);
|
||||
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
|
||||
if (Math.min(result1, result2) != 0)
|
||||
hardcheck1Layer = true;
|
||||
|
||||
//SOFTCHECK and HARDCHECK2 - comparison of the titles
|
||||
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
||||
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
||||
double titleScore = Double.isNaN(result3)?0.0:result3;
|
||||
|
||||
if (startLayer) {
|
||||
return titleScore >= 0.90;
|
||||
}
|
||||
else {
|
||||
if (hardcheck1Layer) {
|
||||
return titleScore >= 0.99;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
//levenstein title
|
||||
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
||||
if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
|
||||
score += 0.2;
|
||||
}
|
||||
|
||||
//pid
|
||||
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
||||
if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
|
||||
score += 0.5;
|
||||
}
|
||||
|
||||
//title version
|
||||
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
||||
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
||||
if(result1<0 || result1>=1.0) {
|
||||
score += 0.1;
|
||||
}
|
||||
|
||||
//authors match
|
||||
params.remove("mode");
|
||||
AuthorsMatch authorsMatch = new AuthorsMatch(params);
|
||||
double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
|
||||
if(result2 <0|| result2>=0.6) {
|
||||
score += 0.2;
|
||||
}
|
||||
|
||||
return score>=0.5;
|
||||
}
|
||||
|
||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
||||
|
||||
if (result) {
|
||||
|
|
|
@ -103,6 +103,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
final String s = "Search for the Standard Model Higgs Boson";
|
||||
System.out.println(s);
|
||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
||||
|
||||
params.put("len", 3);
|
||||
params.put("max", 1);
|
||||
|
||||
System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -272,5 +272,17 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
assertEquals(1.0, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void domainExactMatch() {
|
||||
|
||||
DomainExactMatch domainExactMatch = new DomainExactMatch(params);
|
||||
Field a = url("http://www.flowrepository.org");
|
||||
Field b = url("http://flowrepository.org/");
|
||||
|
||||
double compare = domainExactMatch.compare(a, b, conf);
|
||||
System.out.println("compare = " + compare);
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue