minor changes in comparators

This commit is contained in:
miconis 2020-01-24 10:01:11 +01:00
parent 4dce785375
commit 5c8f6febee
7 changed files with 288 additions and 47 deletions

View File

@ -17,6 +17,7 @@ import java.io.IOException;
import java.io.StringWriter;
import java.text.Normalizer;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@ -51,6 +52,8 @@ public abstract class AbstractPaceFunctions {
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
protected String concat(final List<String> l) {
@ -58,7 +61,7 @@ public abstract class AbstractPaceFunctions {
}
protected String cleanup(final String s) {
final String s0 = s.toLowerCase();
final String s0 = unicodeNormalization(s.toLowerCase());
final String s1 = fixAliases(s0);
final String s2 = nfd(s1);
final String s3 = s2.replaceAll("&ndash;", " ");
@ -136,7 +139,7 @@ public abstract class AbstractPaceFunctions {
}
protected String normalize(final String s) {
return nfd(s)
return nfd(unicodeNormalization(s))
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
.replaceAll("[^ \\w]+", "")
@ -151,6 +154,18 @@ public abstract class AbstractPaceFunctions {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
/**
 * Decodes literal unicode escape sequences found in the given string.
 *
 * Every occurrence matched by {@code hexUnicodePattern} (a backslash, the letter
 * {@code u} and exactly four hexadecimal digits) is replaced by the single
 * character whose code unit is the parsed hexadecimal value. All other text is
 * copied through unchanged.
 *
 * @param s the string that may contain escaped sequences
 * @return the string with each matched escape replaced by its character
 */
public String unicodeNormalization(final String s) {
    final Matcher escapes = hexUnicodePattern.matcher(s);
    final StringBuffer decoded = new StringBuffer(s.length());
    for (boolean found = escapes.find(); found; found = escapes.find()) {
        final int codeUnit = Integer.parseInt(escapes.group(1), 16);
        final String literal = Character.toString((char) codeUnit);
        // quoteReplacement keeps a decoded '$' or '\' from being read as a group reference
        escapes.appendReplacement(decoded, Matcher.quoteReplacement(literal));
    }
    escapes.appendTail(decoded);
    return decoded.toString();
}
protected String filterStopWords(final String s, final Set<String> stopwords) {
final StringTokenizer st = new StringTokenizer(s);
final StringBuilder sb = new StringBuilder();

View File

@ -40,7 +40,7 @@ public class KeywordMatch extends AbstractComparator {
return 1.0;
else {
if (codes1.isEmpty() ^ codes2.isEmpty())
return -1; //undefined if one of the two has no keywords
return -1.0; //undefined if one of the two has no keywords
return commonElementsPercentage(codes1, codes2);
}
}

View File

@ -51,7 +51,7 @@ public class TreeNodeDef implements Serializable {
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
new FieldStats(
weight,
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")),
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "-1.0")),
result,
fieldConf.isCountIfUndefined(),
doc1.getFieldMap().get(fieldConf.getField()),

View File

@ -1,8 +1,10 @@
package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.*;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.junit.Before;
import org.junit.Test;
@ -30,7 +32,7 @@ public class ComparatorTest extends AbstractPaceFunctions {
@Test
public void testCleanForSorting() {
NGramUtils utils = new NGramUtils();
System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa"));
System.out.println(utils.cleanupForOrdering("University of Pisa"));
}
@Test
@ -111,6 +113,11 @@ public class ComparatorTest extends AbstractPaceFunctions {
double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf);
System.out.println("result = " + result);
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
System.out.println("result = " + result);
}
@Test

View File

@ -7,6 +7,7 @@ import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.JsonListMatch;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.junit.Before;
import org.junit.Test;
import java.util.HashMap;
@ -19,6 +20,15 @@ import static org.junit.Assert.assertNotNull;
public class ConfigTest extends AbstractPaceTest {
private Map<String, String> params;
@Before
public void setup() {
params = new HashMap<>();
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
}
@Test
public void dedupConfigSerializationTest() {
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
@ -67,13 +77,18 @@ public class ConfigTest extends AbstractPaceTest {
@Test
public void asMapDocumentTest() {
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
final String json = readFromClasspath("organization.json");
final String json = readFromClasspath("publication.json");
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
System.out.println("mapDocument = " + mapDocument.getFieldMap());
JsonListMatch jsonListMatch = new JsonListMatch(params);
jsonListMatch.compare(mapDocument.getFieldMap().get("pid"), mapDocument.getFieldMap().get("pid"), null);
}
@Test

View File

@ -25,37 +25,13 @@
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.entity.id"
"idPath": "$.id"
},
"pace": {
"clustering" : [
{
"name": "ngrampairs",
"fields": [
"title"
],
"params": {
"max": "1",
"ngramLen": "3"
}
},
{
"name": "suffixprefix",
"fields": [
"title"
],
"params": {
"max": "1",
"len": "3"
}
},
{
"name": "lowercase",
"fields": [
"doi"
],
"params": {}
}
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
"decisionTree": {
"start": {
@ -66,14 +42,13 @@
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"threshold": "0.5",
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"threshold": 0.5,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
@ -97,7 +72,7 @@
}
],
"threshold": 1.0,
"aggregation": "NC",
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
@ -107,14 +82,14 @@
"fields": [
{
"field": "title",
"comparator": "LevensteinTitle",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "SUM",
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
@ -130,7 +105,7 @@
{
"name": "pid",
"type": "JSON",
"path": "$.pid[*]",
"path": "$.pid",
"overrideMatch": "true"
},
{

File diff suppressed because one or more lines are too long