minor changes
This commit is contained in:
parent
5aebe63f22
commit
42cff050e7
|
@ -3,7 +3,7 @@
|
||||||
"threshold" : "0.99",
|
"threshold" : "0.99",
|
||||||
"dedupRun" : "001",
|
"dedupRun" : "001",
|
||||||
"entityType" : "datasource",
|
"entityType" : "datasource",
|
||||||
"orderField" : "officialname",
|
"orderField" : "englishname",
|
||||||
"queueMaxSize" : "2000",
|
"queueMaxSize" : "2000",
|
||||||
"groupMaxSize" : "50",
|
"groupMaxSize" : "50",
|
||||||
"slidingWindowSize" : "200",
|
"slidingWindowSize" : "200",
|
||||||
|
@ -16,6 +16,7 @@
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
{ "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} },
|
{ "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} },
|
||||||
{ "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } },
|
{ "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } },
|
||||||
|
{"name" : "ngrams", "fields" : ["officialname", "englishname"], "params" : {"ngramLen": 4, "max" : 2, "maxPerToken": 2, "minNgramLen": 1, "collapseOn:name": "0"}},
|
||||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
|
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
|
||||||
],
|
],
|
||||||
"decisionTree" : {
|
"decisionTree" : {
|
||||||
|
@ -80,9 +81,7 @@
|
||||||
{ "name" : "officialname", "type" : "String", "path" : "$.officialname" },
|
{ "name" : "officialname", "type" : "String", "path" : "$.officialname" },
|
||||||
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
|
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
|
||||||
],
|
],
|
||||||
"blacklists" : {
|
"blacklists" : {},
|
||||||
"legalname" : []
|
|
||||||
},
|
|
||||||
"synonyms": {}
|
"synonyms": {}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -9,7 +9,7 @@
|
||||||
"queueMaxSize": "200",
|
"queueMaxSize": "200",
|
||||||
"groupMaxSize": "100",
|
"groupMaxSize": "100",
|
||||||
"maxChildren": "100",
|
"maxChildren": "100",
|
||||||
"slidingWindowSize": "50",
|
"slidingWindowSize": "100",
|
||||||
"rootBuilder": [
|
"rootBuilder": [
|
||||||
"result",
|
"result",
|
||||||
"resultProject_outcome_isProducedBy",
|
"resultProject_outcome_isProducedBy",
|
||||||
|
@ -29,7 +29,7 @@
|
||||||
},
|
},
|
||||||
"pace": {
|
"pace": {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
{ "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
|
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||||
{ "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} }
|
{ "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} }
|
||||||
],
|
],
|
||||||
"decisionTree": {
|
"decisionTree": {
|
||||||
|
@ -42,18 +42,36 @@
|
||||||
"countIfUndefined": "false",
|
"countIfUndefined": "false",
|
||||||
"params": {
|
"params": {
|
||||||
"jpath_value": "$.value",
|
"jpath_value": "$.value",
|
||||||
"jpath_classid": "$.qualifier.classid"
|
"jpath_classid": "$.qualifier.classid",
|
||||||
|
"mode": "count"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.5,
|
"threshold": 1.0,
|
||||||
"aggregation": "MAX",
|
"aggregation": "MAX",
|
||||||
"positive": "softCheck",
|
"positive": "MATCH",
|
||||||
"negative": "hardCheck1",
|
"negative": "versionCheck",
|
||||||
"undefined": "hardCheck1",
|
"undefined": "versionCheck",
|
||||||
"ignoreUndefined": "true"
|
"ignoreUndefined": "true"
|
||||||
},
|
},
|
||||||
"softCheck": {
|
"versionCheck": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "title",
|
||||||
|
"comparator": "titleVersionMatch",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 1.0,
|
||||||
|
"aggregation": "MAX",
|
||||||
|
"positive": "titleCheck",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "titleCheck",
|
||||||
|
"ignoreUndefined": "false"
|
||||||
|
},
|
||||||
|
"titleCheck": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "title",
|
"field": "title",
|
||||||
|
@ -64,52 +82,28 @@
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.9,
|
"threshold": 0.9,
|
||||||
"aggregation": "AVG",
|
"aggregation": "MAX",
|
||||||
"positive": "MATCH",
|
"positive": "authorsCheck",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "NO_MATCH",
|
"undefined": "NO_MATCH",
|
||||||
"ignoreUndefined": "true"
|
"ignoreUndefined": "true"
|
||||||
},
|
},
|
||||||
"hardCheck1": {
|
"authorsCheck": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
|
||||||
"field": "title",
|
|
||||||
"comparator": "titleVersionMatch",
|
|
||||||
"weight": 1.0,
|
|
||||||
"countIfUndefined": "false",
|
|
||||||
"params": {}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"field": "authors",
|
"field": "authors",
|
||||||
"comparator": "sizeMatch",
|
"comparator": "authorsMatch",
|
||||||
"weight": 1.0,
|
"weight": 1.0,
|
||||||
"countIfUndefined": "false",
|
"countIfUndefined": "false",
|
||||||
"params": {}
|
"params": {}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 1.0,
|
"threshold": 0.6,
|
||||||
"aggregation": "AND",
|
"aggregation": "MAX",
|
||||||
"positive": "hardCheck2",
|
|
||||||
"negative": "NO_MATCH",
|
|
||||||
"undefined": "hardCheck2",
|
|
||||||
"ignoreUndefined": "false"
|
|
||||||
},
|
|
||||||
"hardCheck2": {
|
|
||||||
"fields": [
|
|
||||||
{
|
|
||||||
"field": "title",
|
|
||||||
"comparator": "levensteinTitle",
|
|
||||||
"weight": 1.0,
|
|
||||||
"countIfUndefined": "true",
|
|
||||||
"params": {}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"threshold": 0.99,
|
|
||||||
"aggregation": "AVG",
|
|
||||||
"positive": "MATCH",
|
"positive": "MATCH",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "NO_MATCH",
|
"undefined": "MATCH",
|
||||||
"ignoreUndefined": "true"
|
"ignoreUndefined": "false"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"model": [
|
"model": [
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
{"websiteurl": "https://fairsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "eosc________::oxford_e-research_centre::oxford_e-research_centre.fairsharing"}
|
||||||
|
{"websiteurl": "https://FAIRsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "fairsharing_::2521"}
|
||||||
|
{"websiteurl": "https://fairsharing.org/", "englishname": "formerly: biosharing", "officialname": "FAIRsharing", "id": "re3data_____::r3d100010142"}
|
||||||
|
{"websiteurl": "https://fairsharing.org/", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "openaire____::fairsharing"}
|
|
@ -16,6 +16,7 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected String getValue(final Field f) {
|
protected String getValue(final Field f) {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
return asUrl(super.getValue(f)).getHost();
|
return asUrl(super.getValue(f)).getHost();
|
||||||
} catch (MalformedURLException e) {
|
} catch (MalformedURLException e) {
|
||||||
|
|
|
@ -166,6 +166,9 @@ public class BlockProcessorForTesting {
|
||||||
else
|
else
|
||||||
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
|
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
|
||||||
}
|
}
|
||||||
|
// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
|
||||||
|
// emitOutput(true, idPivot, idCurr, context);
|
||||||
|
// }
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -181,41 +184,42 @@ public class BlockProcessorForTesting {
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
|
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
|
||||||
|
//if the score gives 1, the publications are equivalent
|
||||||
boolean startLayer = false;
|
|
||||||
boolean hardcheck1Layer = false;
|
|
||||||
|
|
||||||
//START - comparison of the PIDs json lists
|
|
||||||
Map<String, String> params = new HashMap<>();
|
Map<String, String> params = new HashMap<>();
|
||||||
params.put("jpath_value", "$.value");
|
params.put("jpath_value", "$.value");
|
||||||
params.put("jpath_classid", "$.qualifier.classid");
|
params.put("jpath_classid", "$.qualifier.classid");
|
||||||
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
params.put("mode", "count");
|
||||||
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
|
|
||||||
if (result >= 0.5) //if the result of the comparison is greater than the threshold
|
|
||||||
startLayer = true;
|
|
||||||
|
|
||||||
//HARDCHECK1 - comparison of title versions and authors size
|
double score = 0.0;
|
||||||
|
|
||||||
|
//levenstein title
|
||||||
|
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
||||||
|
if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
|
||||||
|
score += 0.2;
|
||||||
|
}
|
||||||
|
|
||||||
|
//pid
|
||||||
|
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
||||||
|
if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
|
||||||
|
score += 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
//title version
|
||||||
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
||||||
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
||||||
SizeMatch sizeMatch = new SizeMatch(params);
|
if(result1<0 || result1>=1.0) {
|
||||||
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
|
score += 0.1;
|
||||||
if (Math.min(result1, result2) != 0)
|
}
|
||||||
hardcheck1Layer = true;
|
|
||||||
|
|
||||||
//SOFTCHECK and HARDCHECK2 - comparison of the titles
|
//authors match
|
||||||
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
params.remove("mode");
|
||||||
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
AuthorsMatch authorsMatch = new AuthorsMatch(params);
|
||||||
double titleScore = Double.isNaN(result3)?0.0:result3;
|
double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
|
||||||
|
if(result2 <0|| result2>=0.6) {
|
||||||
|
score += 0.2;
|
||||||
|
}
|
||||||
|
|
||||||
if (startLayer) {
|
return score>=0.5;
|
||||||
return titleScore >= 0.90;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if (hardcheck1Layer) {
|
|
||||||
return titleScore >= 0.99;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
||||||
|
|
|
@ -103,6 +103,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
final String s = "Search for the Standard Model Higgs Boson";
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
||||||
|
|
||||||
|
params.put("len", 3);
|
||||||
|
params.put("max", 1);
|
||||||
|
|
||||||
|
System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -272,5 +272,17 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
assertEquals(1.0, result);
|
assertEquals(1.0, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void domainExactMatch() {
|
||||||
|
|
||||||
|
DomainExactMatch domainExactMatch = new DomainExactMatch(params);
|
||||||
|
Field a = url("http://www.flowrepository.org");
|
||||||
|
Field b = url("http://flowrepository.org/");
|
||||||
|
|
||||||
|
double compare = domainExactMatch.compare(a, b, conf);
|
||||||
|
System.out.println("compare = " + compare);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue