forked from D-Net/dnet-hadoop
minor changes in comparators
This commit is contained in:
parent
4dce785375
commit
5c8f6febee
|
@ -17,6 +17,7 @@ import java.io.IOException;
|
|||
import java.io.StringWriter;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
@ -51,6 +52,8 @@ public abstract class AbstractPaceFunctions {
|
|||
|
||||
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
|
||||
|
||||
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||
|
||||
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
||||
|
||||
protected String concat(final List<String> l) {
|
||||
|
@ -58,7 +61,7 @@ public abstract class AbstractPaceFunctions {
|
|||
}
|
||||
|
||||
protected String cleanup(final String s) {
|
||||
final String s0 = s.toLowerCase();
|
||||
final String s0 = unicodeNormalization(s.toLowerCase());
|
||||
final String s1 = fixAliases(s0);
|
||||
final String s2 = nfd(s1);
|
||||
final String s3 = s2.replaceAll("–", " ");
|
||||
|
@ -136,7 +139,7 @@ public abstract class AbstractPaceFunctions {
|
|||
}
|
||||
|
||||
protected String normalize(final String s) {
|
||||
return nfd(s)
|
||||
return nfd(unicodeNormalization(s))
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
|
@ -151,6 +154,18 @@ public abstract class AbstractPaceFunctions {
|
|||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
public String unicodeNormalization(final String s) {
|
||||
|
||||
Matcher m = hexUnicodePattern.matcher(s);
|
||||
StringBuffer buf = new StringBuffer(s.length());
|
||||
while (m.find()) {
|
||||
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
||||
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
||||
}
|
||||
m.appendTail(buf);
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
protected String filterStopWords(final String s, final Set<String> stopwords) {
|
||||
final StringTokenizer st = new StringTokenizer(s);
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
|
|
@ -40,7 +40,7 @@ public class KeywordMatch extends AbstractComparator {
|
|||
return 1.0;
|
||||
else {
|
||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||
return -1; //undefined if one of the two has no keywords
|
||||
return -1.0; //undefined if one of the two has no keywords
|
||||
return commonElementsPercentage(codes1, codes2);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -51,7 +51,7 @@ public class TreeNodeDef implements Serializable {
|
|||
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
|
||||
new FieldStats(
|
||||
weight,
|
||||
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")),
|
||||
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "-1.0")),
|
||||
result,
|
||||
fieldConf.isCountIfUndefined(),
|
||||
doc1.getFieldMap().get(fieldConf.getField()),
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
package eu.dnetlib.pace.comparators;
|
||||
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.tree.*;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
|
@ -30,7 +32,7 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
|||
@Test
|
||||
public void testCleanForSorting() {
|
||||
NGramUtils utils = new NGramUtils();
|
||||
System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa"));
|
||||
System.out.println(utils.cleanupForOrdering("University of Pisa"));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -111,10 +113,15 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
|||
|
||||
double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf);
|
||||
System.out.println("result = " + result);
|
||||
|
||||
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
|
||||
System.out.println("result = " + result);
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void jsonListMatchTest() {
|
||||
public void jsonListMatchTest(){
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@ import eu.dnetlib.pace.model.FieldList;
|
|||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.tree.JsonListMatch;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
@ -19,6 +20,15 @@ import static org.junit.Assert.assertNotNull;
|
|||
|
||||
public class ConfigTest extends AbstractPaceTest {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
params = new HashMap<>();
|
||||
params.put("jpath_value", "$.value");
|
||||
params.put("jpath_classid", "$.qualifier.classid");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void dedupConfigSerializationTest() {
|
||||
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
|
||||
|
@ -67,13 +77,18 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
@Test
|
||||
public void asMapDocumentTest() {
|
||||
|
||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
|
||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
|
||||
|
||||
final String json = readFromClasspath("organization.json");
|
||||
final String json = readFromClasspath("publication.json");
|
||||
|
||||
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
||||
|
||||
System.out.println("mapDocument = " + mapDocument.getFieldMap());
|
||||
|
||||
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
||||
|
||||
jsonListMatch.compare(mapDocument.getFieldMap().get("pid"), mapDocument.getFieldMap().get("pid"), null);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -25,37 +25,13 @@
|
|||
],
|
||||
"includeChildren": "true",
|
||||
"maxIterations": 20,
|
||||
"idPath": "$.entity.id"
|
||||
"idPath": "$.id"
|
||||
},
|
||||
"pace": {
|
||||
"clustering": [
|
||||
{
|
||||
"name": "ngrampairs",
|
||||
"fields": [
|
||||
"title"
|
||||
],
|
||||
"params": {
|
||||
"max": "1",
|
||||
"ngramLen": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "suffixprefix",
|
||||
"fields": [
|
||||
"title"
|
||||
],
|
||||
"params": {
|
||||
"max": "1",
|
||||
"len": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "lowercase",
|
||||
"fields": [
|
||||
"doi"
|
||||
],
|
||||
"params": {}
|
||||
}
|
||||
"clustering" : [
|
||||
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
||||
],
|
||||
"decisionTree": {
|
||||
"start": {
|
||||
|
@ -66,14 +42,13 @@
|
|||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"threshold": "0.5",
|
||||
"jpath_value": "$.value",
|
||||
"jpath_classid": "$.qualifier.classid"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "MAX",
|
||||
"threshold": 0.5,
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "layer2",
|
||||
"undefined": "layer2",
|
||||
|
@ -97,7 +72,7 @@
|
|||
}
|
||||
],
|
||||
"threshold": 1.0,
|
||||
"aggregation": "NC",
|
||||
"aggregation": "AND",
|
||||
"positive": "layer3",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "layer3",
|
||||
|
@ -107,14 +82,14 @@
|
|||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "LevensteinTitle",
|
||||
"comparator": "levensteinTitle",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 0.99,
|
||||
"aggregation": "SUM",
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
|
@ -130,7 +105,7 @@
|
|||
{
|
||||
"name": "pid",
|
||||
"type": "JSON",
|
||||
"path": "$.pid[*]",
|
||||
"path": "$.pid",
|
||||
"overrideMatch": "true"
|
||||
},
|
||||
{
|
||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue