forked from D-Net/dnet-hadoop
minor changes in comparators
This commit is contained in:
parent
4dce785375
commit
5c8f6febee
|
@ -17,6 +17,7 @@ import java.io.IOException;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
@ -51,6 +52,8 @@ public abstract class AbstractPaceFunctions {
|
||||||
|
|
||||||
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
|
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
|
||||||
|
|
||||||
|
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||||
|
|
||||||
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
||||||
|
|
||||||
protected String concat(final List<String> l) {
|
protected String concat(final List<String> l) {
|
||||||
|
@ -58,7 +61,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String cleanup(final String s) {
|
protected String cleanup(final String s) {
|
||||||
final String s0 = s.toLowerCase();
|
final String s0 = unicodeNormalization(s.toLowerCase());
|
||||||
final String s1 = fixAliases(s0);
|
final String s1 = fixAliases(s0);
|
||||||
final String s2 = nfd(s1);
|
final String s2 = nfd(s1);
|
||||||
final String s3 = s2.replaceAll("–", " ");
|
final String s3 = s2.replaceAll("–", " ");
|
||||||
|
@ -136,7 +139,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String normalize(final String s) {
|
protected String normalize(final String s) {
|
||||||
return nfd(s)
|
return nfd(unicodeNormalization(s))
|
||||||
.toLowerCase()
|
.toLowerCase()
|
||||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
||||||
.replaceAll("[^ \\w]+", "")
|
.replaceAll("[^ \\w]+", "")
|
||||||
|
@ -151,6 +154,18 @@ public abstract class AbstractPaceFunctions {
|
||||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String unicodeNormalization(final String s) {
|
||||||
|
|
||||||
|
Matcher m = hexUnicodePattern.matcher(s);
|
||||||
|
StringBuffer buf = new StringBuffer(s.length());
|
||||||
|
while (m.find()) {
|
||||||
|
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
||||||
|
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
||||||
|
}
|
||||||
|
m.appendTail(buf);
|
||||||
|
return buf.toString();
|
||||||
|
}
|
||||||
|
|
||||||
protected String filterStopWords(final String s, final Set<String> stopwords) {
|
protected String filterStopWords(final String s, final Set<String> stopwords) {
|
||||||
final StringTokenizer st = new StringTokenizer(s);
|
final StringTokenizer st = new StringTokenizer(s);
|
||||||
final StringBuilder sb = new StringBuilder();
|
final StringBuilder sb = new StringBuilder();
|
||||||
|
|
|
@ -40,7 +40,7 @@ public class KeywordMatch extends AbstractComparator {
|
||||||
return 1.0;
|
return 1.0;
|
||||||
else {
|
else {
|
||||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||||
return -1; //undefined if one of the two has no keywords
|
return -1.0; //undefined if one of the two has no keywords
|
||||||
return commonElementsPercentage(codes1, codes2);
|
return commonElementsPercentage(codes1, codes2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,7 +51,7 @@ public class TreeNodeDef implements Serializable {
|
||||||
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
|
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
|
||||||
new FieldStats(
|
new FieldStats(
|
||||||
weight,
|
weight,
|
||||||
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")),
|
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "-1.0")),
|
||||||
result,
|
result,
|
||||||
fieldConf.isCountIfUndefined(),
|
fieldConf.isCountIfUndefined(),
|
||||||
doc1.getFieldMap().get(fieldConf.getField()),
|
doc1.getFieldMap().get(fieldConf.getField()),
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
package eu.dnetlib.pace.comparators;
|
package eu.dnetlib.pace.comparators;
|
||||||
|
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||||
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
import eu.dnetlib.pace.tree.*;
|
import eu.dnetlib.pace.tree.*;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
@ -30,7 +32,7 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
||||||
@Test
|
@Test
|
||||||
public void testCleanForSorting() {
|
public void testCleanForSorting() {
|
||||||
NGramUtils utils = new NGramUtils();
|
NGramUtils utils = new NGramUtils();
|
||||||
System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa"));
|
System.out.println(utils.cleanupForOrdering("University of Pisa"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -111,10 +113,15 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
||||||
|
|
||||||
double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf);
|
double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf);
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
|
||||||
public void jsonListMatchTest() {
|
System.out.println("result = " + result);
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void jsonListMatchTest(){
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -7,6 +7,7 @@ import eu.dnetlib.pace.model.FieldList;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
import eu.dnetlib.pace.tree.JsonListMatch;
|
import eu.dnetlib.pace.tree.JsonListMatch;
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
@ -19,6 +20,15 @@ import static org.junit.Assert.assertNotNull;
|
||||||
|
|
||||||
public class ConfigTest extends AbstractPaceTest {
|
public class ConfigTest extends AbstractPaceTest {
|
||||||
|
|
||||||
|
private Map<String, String> params;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setup() {
|
||||||
|
params = new HashMap<>();
|
||||||
|
params.put("jpath_value", "$.value");
|
||||||
|
params.put("jpath_classid", "$.qualifier.classid");
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void dedupConfigSerializationTest() {
|
public void dedupConfigSerializationTest() {
|
||||||
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
|
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
|
||||||
|
@ -67,13 +77,18 @@ public class ConfigTest extends AbstractPaceTest {
|
||||||
@Test
|
@Test
|
||||||
public void asMapDocumentTest() {
|
public void asMapDocumentTest() {
|
||||||
|
|
||||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.current.conf.json"));
|
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
|
||||||
|
|
||||||
final String json = readFromClasspath("organization.json");
|
final String json = readFromClasspath("publication.json");
|
||||||
|
|
||||||
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
||||||
|
|
||||||
System.out.println("mapDocument = " + mapDocument.getFieldMap());
|
System.out.println("mapDocument = " + mapDocument.getFieldMap());
|
||||||
|
|
||||||
|
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
||||||
|
|
||||||
|
jsonListMatch.compare(mapDocument.getFieldMap().get("pid"), mapDocument.getFieldMap().get("pid"), null);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -25,37 +25,13 @@
|
||||||
],
|
],
|
||||||
"includeChildren": "true",
|
"includeChildren": "true",
|
||||||
"maxIterations": 20,
|
"maxIterations": 20,
|
||||||
"idPath": "$.entity.id"
|
"idPath": "$.id"
|
||||||
},
|
},
|
||||||
"pace": {
|
"pace": {
|
||||||
"clustering": [
|
"clustering" : [
|
||||||
{
|
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||||
"name": "ngrampairs",
|
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
||||||
"fields": [
|
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
||||||
"title"
|
|
||||||
],
|
|
||||||
"params": {
|
|
||||||
"max": "1",
|
|
||||||
"ngramLen": "3"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "suffixprefix",
|
|
||||||
"fields": [
|
|
||||||
"title"
|
|
||||||
],
|
|
||||||
"params": {
|
|
||||||
"max": "1",
|
|
||||||
"len": "3"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "lowercase",
|
|
||||||
"fields": [
|
|
||||||
"doi"
|
|
||||||
],
|
|
||||||
"params": {}
|
|
||||||
}
|
|
||||||
],
|
],
|
||||||
"decisionTree": {
|
"decisionTree": {
|
||||||
"start": {
|
"start": {
|
||||||
|
@ -66,14 +42,13 @@
|
||||||
"weight": 1.0,
|
"weight": 1.0,
|
||||||
"countIfUndefined": "false",
|
"countIfUndefined": "false",
|
||||||
"params": {
|
"params": {
|
||||||
"threshold": "0.5",
|
|
||||||
"jpath_value": "$.value",
|
"jpath_value": "$.value",
|
||||||
"jpath_classid": "$.qualifier.classid"
|
"jpath_classid": "$.qualifier.classid"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 1.0,
|
"threshold": 0.5,
|
||||||
"aggregation": "MAX",
|
"aggregation": "AVG",
|
||||||
"positive": "MATCH",
|
"positive": "MATCH",
|
||||||
"negative": "layer2",
|
"negative": "layer2",
|
||||||
"undefined": "layer2",
|
"undefined": "layer2",
|
||||||
|
@ -97,7 +72,7 @@
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 1.0,
|
"threshold": 1.0,
|
||||||
"aggregation": "NC",
|
"aggregation": "AND",
|
||||||
"positive": "layer3",
|
"positive": "layer3",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "layer3",
|
"undefined": "layer3",
|
||||||
|
@ -107,14 +82,14 @@
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "title",
|
"field": "title",
|
||||||
"comparator": "LevensteinTitle",
|
"comparator": "levensteinTitle",
|
||||||
"weight": 1.0,
|
"weight": 1.0,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
"params": {}
|
"params": {}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.99,
|
"threshold": 0.99,
|
||||||
"aggregation": "SUM",
|
"aggregation": "AVG",
|
||||||
"positive": "MATCH",
|
"positive": "MATCH",
|
||||||
"negative": "NO_MATCH",
|
"negative": "NO_MATCH",
|
||||||
"undefined": "NO_MATCH",
|
"undefined": "NO_MATCH",
|
||||||
|
@ -130,7 +105,7 @@
|
||||||
{
|
{
|
||||||
"name": "pid",
|
"name": "pid",
|
||||||
"type": "JSON",
|
"type": "JSON",
|
||||||
"path": "$.pid[*]",
|
"path": "$.pid",
|
||||||
"overrideMatch": "true"
|
"overrideMatch": "true"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue