minor changes in comparators

This commit is contained in:
miconis 2020-01-24 10:01:11 +01:00
parent cc86591fad
commit eeeb374480
11 changed files with 536 additions and 89 deletions

File diff suppressed because one or more lines are too long

View File

@ -88,7 +88,7 @@
}
],
"threshold": 0.1,
"aggregation": "W_MEAN",
"aggregation": "AVG",
"positive": "layer4",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
@ -143,18 +143,18 @@
}
},
"model" : [
{ "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"},
{ "name" : "country", "type" : "String", "path" : "$.country.classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
],
"blacklists" : {
"legalname" : []
},
"synonyms": {
"key::1": ["university","università","università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],

View File

@ -28,34 +28,10 @@
"idPath": "$.id"
},
"pace": {
"clustering": [
{
"name": "ngrampairs",
"fields": [
"title"
],
"params": {
"max": "1",
"ngramLen": "3"
}
},
{
"name": "suffixprefix",
"fields": [
"title"
],
"params": {
"max": "1",
"len": "3"
}
},
{
"name": "lowercase",
"fields": [
"doi"
],
"params": {}
}
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
"decisionTree": {
"start": {
@ -72,7 +48,7 @@
}
],
"threshold": 0.5,
"aggregation": "MAX",
"aggregation": "AVG",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
@ -96,7 +72,7 @@
}
],
"threshold": 1.0,
"aggregation": "NC",
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
@ -113,7 +89,7 @@
}
],
"threshold": 0.99,
"aggregation": "SUM",
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
@ -129,7 +105,7 @@
{
"name": "pid",
"type": "JSON",
"path": "$.pid[*]",
"path": "$.pid",
"overrideMatch": "true"
},
{
@ -142,7 +118,7 @@
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname[*]",
"path": "$.author[*].fullname",
"size": 200
},
{

View File

@ -0,0 +1,230 @@
{
"journal": {
"name": "",
"issnPrinted": "",
"issnOnline": "",
"issnLinking": "",
"ep": "",
"iss": "",
"sp": "",
"vol": "",
"edition": "",
"conferenceplace": "",
"conferencedate": "",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"inferenceprovenance": "",
"provenanceaction": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
}
}
},
"author": [
{
"fullname": "Osti Guerrazzi, Amedeo",
"name": "Amedeo",
"surname": "Osti Guerrazzi",
"rank": 1,
"pid": [],
"affiliation": []
}
],
"resulttype": {
"classid": "publication",
"classname": "publication",
"schemeid": "dnet:result_typologies",
"schemename": "dnet:result_typologies"
},
"language": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"country": [],
"subject": [],
"title": [
{
"value": "Rezension: Gabriele Rigano: L\u2019interprete di Auschwitz. Arminio Wachsberger un testimone d\u2019eccezione della deportazione degli ebrei di Roma (rezensiert von Amedeo Osti Guerrazzi)",
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemeid": "dnet:dataCite_title",
"schemename": "dnet:dataCite_title"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"inferenceprovenance": "",
"provenanceaction": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
}
}
}
],
"relevantdate": [],
"description": [],
"dateofacceptance": {
"value": "2018-01-01",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"inferenceprovenance": "",
"provenanceaction": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
}
}
},
"publisher": {
"value": "BSB - Bavarian State Library",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"inferenceprovenance": "",
"provenanceaction": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
}
}
},
"embargoenddate": {
"value": "",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"inferenceprovenance": "",
"provenanceaction": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
}
}
},
"source": [],
"fulltext": [],
"format": [],
"contributor": [],
"resourcetype": {
"classid": "RezensionReview",
"classname": "RezensionReview",
"schemeid": "dnet:dataCite_resource",
"schemename": "dnet:dataCite_resource"
},
"coverage": [],
"refereed": {
"value": "",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"inferenceprovenance": "",
"provenanceaction": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
}
}
},
"context": [],
"id": "50|datacite____::e23cad995f89bf6cfe63dcc83d41cce9",
"originalId": [
"http://dx.doi.org/10.15463/rec.2071701652",
"10.15463/rec.2071701652",
"https://doi.org/10.15463/rec.2071701652"
],
"collectedfrom": [
{
"key": "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254",
"value": "Datacite",
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"inferenceprovenance": "",
"provenanceaction": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
}
}
}
],
"pid": [
{
"value": "https://doi.org/10.15463/rec.2071701652",
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"inferenceprovenance": "",
"provenanceaction": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
}
}
},
{
"value": "10.15463/rec.2071701652",
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"inferenceprovenance": "",
"provenanceaction": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
}
}
}
],
"extraInfo": [],
"dataInfo": {
"invisible": false,
"inferred": false,
"deletedbyinference": false,
"inferenceprovenance": "",
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
}
},
"lastupdatetimestamp": 0
}

View File

@ -17,6 +17,7 @@ import java.io.IOException;
import java.io.StringWriter;
import java.text.Normalizer;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@ -51,6 +52,8 @@ public abstract class AbstractPaceFunctions {
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
protected String concat(final List<String> l) {
@ -58,7 +61,7 @@ public abstract class AbstractPaceFunctions {
}
protected String cleanup(final String s) {
final String s0 = s.toLowerCase();
final String s0 = unicodeNormalization(s.toLowerCase());
final String s1 = fixAliases(s0);
final String s2 = nfd(s1);
final String s3 = s2.replaceAll("&ndash;", " ");
@ -136,7 +139,7 @@ public abstract class AbstractPaceFunctions {
}
protected String normalize(final String s) {
return nfd(s)
return nfd(unicodeNormalization(s))
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
.replaceAll("[^ \\w]+", "")
@ -151,6 +154,18 @@ public abstract class AbstractPaceFunctions {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
/**
 * Decodes textual unicode escapes embedded in a string.
 *
 * Every occurrence matched by {@code hexUnicodePattern} (a literal backslash,
 * the letter u and exactly four hexadecimal digits) is replaced by the single
 * UTF-16 code unit whose value those four digits spell out; e.g. the six
 * characters backslash, u, 0, 0, f, 3 become the one character U+00F3.
 * All other text is copied through unchanged.
 *
 * @param s the text to decode; must not be null
 * @return a copy of {@code s} with every matched escape replaced by its character
 */
public String unicodeNormalization(final String s) {
    final Matcher escapes = hexUnicodePattern.matcher(s);
    final StringBuilder decoded = new StringBuilder(s.length());
    // index of the first character of s that has not been copied to the output yet
    int cursor = 0;
    while (escapes.find()) {
        // copy the plain text that sits between the previous escape and this one
        decoded.append(s, cursor, escapes.start());
        // group 1 holds the four hex digits; they always fit into a single char
        decoded.append((char) Integer.parseInt(escapes.group(1), 16));
        cursor = escapes.end();
    }
    // copy whatever follows the last escape (the whole input when nothing matched)
    decoded.append(s, cursor, s.length());
    return decoded.toString();
}
protected String filterStopWords(final String s, final Set<String> stopwords) {
final StringTokenizer st = new StringTokenizer(s);
final StringBuilder sb = new StringBuilder();

View File

@ -40,7 +40,7 @@ public class KeywordMatch extends AbstractComparator {
return 1.0;
else {
if (codes1.isEmpty() ^ codes2.isEmpty())
return -1; //undefined if one of the two has no keywords
return -1.0; //undefined if one of the two has no keywords
return commonElementsPercentage(codes1, codes2);
}
}

View File

@ -51,7 +51,7 @@ public class TreeNodeDef implements Serializable {
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
new FieldStats(
weight,
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")),
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "-1.0")),
result,
fieldConf.isCountIfUndefined(),
doc1.getFieldMap().get(fieldConf.getField()),

View File

@ -1,8 +1,10 @@
package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.*;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.junit.Before;
import org.junit.Test;
@ -30,7 +32,7 @@ public class ComparatorTest extends AbstractPaceFunctions {
@Test
public void testCleanForSorting() {
NGramUtils utils = new NGramUtils();
System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa"));
System.out.println(utils.cleanupForOrdering("University of Pisa"));
}
@Test
@ -111,10 +113,15 @@ public class ComparatorTest extends AbstractPaceFunctions {
double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf);
System.out.println("result = " + result);
}
@Test
public void jsonListMatchTest() {
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
System.out.println("result = " + result);
}
@Test
public void jsonListMatchTest(){
}

View File

@ -7,6 +7,7 @@ import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.JsonListMatch;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.junit.Before;
import org.junit.Test;
import java.util.HashMap;
@ -19,6 +20,15 @@ import static org.junit.Assert.assertNotNull;
public class ConfigTest extends AbstractPaceTest {
private Map<String, String> params;
/**
 * Rebuilds the comparator parameter map before each test: two JSONPath
 * expressions registered under the keys "jpath_value" and "jpath_classid".
 */
@Before
public void setup() {
    final Map<String, String> jsonPathParams = new HashMap<>();
    jsonPathParams.put("jpath_classid", "$.qualifier.classid");
    jsonPathParams.put("jpath_value", "$.value");
    params = jsonPathParams;
}
@Test
public void dedupConfigSerializationTest() {
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
@ -67,13 +77,18 @@ public class ConfigTest extends AbstractPaceTest {
@Test
public void asMapDocumentTest() {
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
final String json = readFromClasspath("organization.json");
final String json = readFromClasspath("publication.json");
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
System.out.println("mapDocument = " + mapDocument.getFieldMap());
JsonListMatch jsonListMatch = new JsonListMatch(params);
jsonListMatch.compare(mapDocument.getFieldMap().get("pid"), mapDocument.getFieldMap().get("pid"), null);
}
@Test

View File

@ -25,37 +25,13 @@
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.entity.id"
"idPath": "$.id"
},
"pace": {
"clustering": [
{
"name": "ngrampairs",
"fields": [
"title"
],
"params": {
"max": "1",
"ngramLen": "3"
}
},
{
"name": "suffixprefix",
"fields": [
"title"
],
"params": {
"max": "1",
"len": "3"
}
},
{
"name": "lowercase",
"fields": [
"doi"
],
"params": {}
}
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
"decisionTree": {
"start": {
@ -66,14 +42,13 @@
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"threshold": "0.5",
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"threshold": 0.5,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
@ -97,7 +72,7 @@
}
],
"threshold": 1.0,
"aggregation": "NC",
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
@ -107,14 +82,14 @@
"fields": [
{
"field": "title",
"comparator": "LevensteinTitle",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "SUM",
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
@ -130,7 +105,7 @@
{
"name": "pid",
"type": "JSON",
"path": "$.pid[*]",
"path": "$.pid",
"overrideMatch": "true"
},
{

File diff suppressed because one or more lines are too long