minor changes in comparators

This commit is contained in:
miconis 2020-01-24 10:01:11 +01:00
parent 4dce785375
commit 5c8f6febee
7 changed files with 288 additions and 47 deletions

View File

@ -17,6 +17,7 @@ import java.io.IOException;
import java.io.StringWriter; import java.io.StringWriter;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -51,6 +52,8 @@ public abstract class AbstractPaceFunctions {
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?"); private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
protected final static FieldList EMPTY_FIELD = new FieldListImpl(); protected final static FieldList EMPTY_FIELD = new FieldListImpl();
protected String concat(final List<String> l) { protected String concat(final List<String> l) {
@ -58,7 +61,7 @@ public abstract class AbstractPaceFunctions {
} }
protected String cleanup(final String s) { protected String cleanup(final String s) {
final String s0 = s.toLowerCase(); final String s0 = unicodeNormalization(s.toLowerCase());
final String s1 = fixAliases(s0); final String s1 = fixAliases(s0);
final String s2 = nfd(s1); final String s2 = nfd(s1);
final String s3 = s2.replaceAll("&ndash;", " "); final String s3 = s2.replaceAll("&ndash;", " ");
@ -136,7 +139,7 @@ public abstract class AbstractPaceFunctions {
} }
protected String normalize(final String s) { protected String normalize(final String s) {
return nfd(s) return nfd(unicodeNormalization(s))
.toLowerCase() .toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
.replaceAll("[^ \\w]+", "") .replaceAll("[^ \\w]+", "")
@ -151,6 +154,18 @@ public abstract class AbstractPaceFunctions {
return Normalizer.normalize(s, Normalizer.Form.NFD); return Normalizer.normalize(s, Normalizer.Form.NFD);
} }
/**
 * Decodes textual unicode escapes embedded in a string.
 *
 * Every occurrence matched by {@code hexUnicodePattern} (a backslash, the letter u and
 * exactly four hexadecimal digits) is replaced by the single character whose code unit
 * is the value of those four digits. All other text is copied through unchanged.
 *
 * @param s the string that may contain textual escapes; must not be null
 * @return a new string in which each matched escape is replaced by its character
 */
public String unicodeNormalization(final String s) {
    final Matcher escapes = hexUnicodePattern.matcher(s);
    final StringBuilder decoded = new StringBuilder(s.length());
    // Index of the first input character not yet copied to the output.
    int copiedUpTo = 0;
    while (escapes.find()) {
        // Copy the literal text between the previous escape and this one.
        decoded.append(s, copiedUpTo, escapes.start());
        // Group 1 holds the four hex digits; append the matching code unit.
        decoded.append((char) Integer.parseInt(escapes.group(1), 16));
        copiedUpTo = escapes.end();
    }
    // Copy whatever follows the last escape (the whole input when none was found).
    decoded.append(s, copiedUpTo, s.length());
    return decoded.toString();
}
protected String filterStopWords(final String s, final Set<String> stopwords) { protected String filterStopWords(final String s, final Set<String> stopwords) {
final StringTokenizer st = new StringTokenizer(s); final StringTokenizer st = new StringTokenizer(s);
final StringBuilder sb = new StringBuilder(); final StringBuilder sb = new StringBuilder();

View File

@ -40,7 +40,7 @@ public class KeywordMatch extends AbstractComparator {
return 1.0; return 1.0;
else { else {
if (codes1.isEmpty() ^ codes2.isEmpty()) if (codes1.isEmpty() ^ codes2.isEmpty())
return -1; //undefined if one of the two has no keywords return -1.0; //undefined if one of the two has no keywords
return commonElementsPercentage(codes1, codes2); return commonElementsPercentage(codes1, codes2);
} }
} }

View File

@ -51,7 +51,7 @@ public class TreeNodeDef implements Serializable {
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
new FieldStats( new FieldStats(
weight, weight,
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")), Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "-1.0")),
result, result,
fieldConf.isCountIfUndefined(), fieldConf.isCountIfUndefined(),
doc1.getFieldMap().get(fieldConf.getField()), doc1.getFieldMap().get(fieldConf.getField()),

View File

@ -1,8 +1,10 @@
package eu.dnetlib.pace.comparators; package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.*; import eu.dnetlib.pace.tree.*;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
@ -30,7 +32,7 @@ public class ComparatorTest extends AbstractPaceFunctions {
@Test @Test
public void testCleanForSorting() { public void testCleanForSorting() {
NGramUtils utils = new NGramUtils(); NGramUtils utils = new NGramUtils();
System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa")); System.out.println(utils.cleanupForOrdering("University of Pisa"));
} }
@Test @Test
@ -111,6 +113,11 @@ public class ComparatorTest extends AbstractPaceFunctions {
double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf); double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
System.out.println("result = " + result);
} }
@Test @Test

View File

@ -7,6 +7,7 @@ import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.JsonListMatch; import eu.dnetlib.pace.tree.JsonListMatch;
import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.util.MapDocumentUtil;
import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import java.util.HashMap; import java.util.HashMap;
@ -19,6 +20,15 @@ import static org.junit.Assert.assertNotNull;
public class ConfigTest extends AbstractPaceTest { public class ConfigTest extends AbstractPaceTest {
private Map<String, String> params;
/**
 * Rebuilds the comparator parameter map before each test: the JSON paths used to
 * read the value and the class id of a list entry.
 */
@Before
public void setup() {
    final Map<String, String> jsonPathParams = new HashMap<>();
    jsonPathParams.put("jpath_classid", "$.qualifier.classid");
    jsonPathParams.put("jpath_value", "$.value");
    // Publish the fully populated map to the fixture field.
    params = jsonPathParams;
}
@Test @Test
public void dedupConfigSerializationTest() { public void dedupConfigSerializationTest() {
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf.json")); final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
@ -67,13 +77,18 @@ public class ConfigTest extends AbstractPaceTest {
@Test @Test
public void asMapDocumentTest() { public void asMapDocumentTest() {
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.current.conf.json")); DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
final String json = readFromClasspath("organization.json"); final String json = readFromClasspath("publication.json");
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
System.out.println("mapDocument = " + mapDocument.getFieldMap()); System.out.println("mapDocument = " + mapDocument.getFieldMap());
JsonListMatch jsonListMatch = new JsonListMatch(params);
jsonListMatch.compare(mapDocument.getFieldMap().get("pid"), mapDocument.getFieldMap().get("pid"), null);
} }
@Test @Test

View File

@ -25,37 +25,13 @@
], ],
"includeChildren": "true", "includeChildren": "true",
"maxIterations": 20, "maxIterations": 20,
"idPath": "$.entity.id" "idPath": "$.id"
}, },
"pace": { "pace": {
"clustering" : [ "clustering" : [
{ { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
"name": "ngrampairs", { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
"fields": [ { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
"title"
],
"params": {
"max": "1",
"ngramLen": "3"
}
},
{
"name": "suffixprefix",
"fields": [
"title"
],
"params": {
"max": "1",
"len": "3"
}
},
{
"name": "lowercase",
"fields": [
"doi"
],
"params": {}
}
], ],
"decisionTree": { "decisionTree": {
"start": { "start": {
@ -66,14 +42,13 @@
"weight": 1.0, "weight": 1.0,
"countIfUndefined": "false", "countIfUndefined": "false",
"params": { "params": {
"threshold": "0.5",
"jpath_value": "$.value", "jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid" "jpath_classid": "$.qualifier.classid"
} }
} }
], ],
"threshold": 1.0, "threshold": 0.5,
"aggregation": "MAX", "aggregation": "AVG",
"positive": "MATCH", "positive": "MATCH",
"negative": "layer2", "negative": "layer2",
"undefined": "layer2", "undefined": "layer2",
@ -97,7 +72,7 @@
} }
], ],
"threshold": 1.0, "threshold": 1.0,
"aggregation": "NC", "aggregation": "AND",
"positive": "layer3", "positive": "layer3",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "layer3", "undefined": "layer3",
@ -107,14 +82,14 @@
"fields": [ "fields": [
{ {
"field": "title", "field": "title",
"comparator": "LevensteinTitle", "comparator": "levensteinTitle",
"weight": 1.0, "weight": 1.0,
"countIfUndefined": "true", "countIfUndefined": "true",
"params": {} "params": {}
} }
], ],
"threshold": 0.99, "threshold": 0.99,
"aggregation": "SUM", "aggregation": "AVG",
"positive": "MATCH", "positive": "MATCH",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "NO_MATCH", "undefined": "NO_MATCH",
@ -130,7 +105,7 @@
{ {
"name": "pid", "name": "pid",
"type": "JSON", "type": "JSON",
"path": "$.pid[*]", "path": "$.pid",
"overrideMatch": "true" "overrideMatch": "true"
}, },
{ {

File diff suppressed because one or more lines are too long