forked from D-Net/dnet-hadoop
The param map has been updated: it now accepts string parameters.
This commit is contained in:
parent
ddd40540aa
commit
8c0d346005
|
@ -11,7 +11,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("alwaysMatch")
|
@ComparatorClass("alwaysMatch")
|
||||||
public class AlwaysMatch extends AbstractComparator {
|
public class AlwaysMatch extends AbstractComparator {
|
||||||
|
|
||||||
public AlwaysMatch(final Map<String, Number> params){
|
public AlwaysMatch(final Map<String, String> params){
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,9 +10,9 @@ import java.util.Set;
|
||||||
@ComparatorClass("cityMatch")
|
@ComparatorClass("cityMatch")
|
||||||
public class CityMatch extends AbstractComparator {
|
public class CityMatch extends AbstractComparator {
|
||||||
|
|
||||||
private Map<String, Number> params;
|
private Map<String, String> params;
|
||||||
|
|
||||||
public CityMatch(Map<String, Number> params) {
|
public CityMatch(Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
@ -29,8 +29,8 @@ public class CityMatch extends AbstractComparator {
|
||||||
ca = filterAllStopWords(ca);
|
ca = filterAllStopWords(ca);
|
||||||
cb = filterAllStopWords(cb);
|
cb = filterAllStopWords(cb);
|
||||||
|
|
||||||
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
|
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||||
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
|
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||||
|
|
||||||
Set<String> codes1 = citiesToCodes(cities1);
|
Set<String> codes1 = citiesToCodes(cities1);
|
||||||
Set<String> codes2 = citiesToCodes(cities2);
|
Set<String> codes2 = citiesToCodes(cities2);
|
||||||
|
@ -41,7 +41,7 @@ public class CityMatch extends AbstractComparator {
|
||||||
else {
|
else {
|
||||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||||
return -1; //undefined if one of the two has no cities
|
return -1; //undefined if one of the two has no cities
|
||||||
return commonElementsPercentage(codes1, codes2) > params.getOrDefault("threshold", 0).intValue() ? 1.0 : 0.0;
|
return commonElementsPercentage(codes1, codes2) > Integer.parseInt(params.getOrDefault("threshold", "0")) ? 1.0 : 0.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,57 @@
|
||||||
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The Class Contains match
|
||||||
|
*
|
||||||
|
* @author miconis
|
||||||
|
* */
|
||||||
|
@ComparatorClass("containsMatch")
|
||||||
|
public class ContainsMatch extends AbstractComparator {
|
||||||
|
|
||||||
|
private Map<String, String> params;
|
||||||
|
|
||||||
|
public ContainsMatch(Map<String, String> params) {
|
||||||
|
super(params);
|
||||||
|
this.params = params;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
|
|
||||||
|
//read parameters
|
||||||
|
boolean caseSensitive = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
|
||||||
|
String string = params.get("string");
|
||||||
|
String agg = params.get("bool");
|
||||||
|
|
||||||
|
String ca = a;
|
||||||
|
String cb = b;
|
||||||
|
if (!caseSensitive) {
|
||||||
|
ca = a.toLowerCase();
|
||||||
|
cb = b.toLowerCase();
|
||||||
|
}
|
||||||
|
|
||||||
|
switch(agg) {
|
||||||
|
case "AND":
|
||||||
|
if(ca.contains(string) && cb.contains(string))
|
||||||
|
return 1.0;
|
||||||
|
break;
|
||||||
|
case "OR":
|
||||||
|
if(ca.contains(string) || cb.contains(string))
|
||||||
|
return 1.0;
|
||||||
|
break;
|
||||||
|
case "XOR":
|
||||||
|
if(ca.contains(string) ^ cb.contains(string))
|
||||||
|
return 1.0;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,6 +1,5 @@
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
@ -16,7 +15,7 @@ public class DoiExactMatch extends ExactMatchIgnoreCase {
|
||||||
|
|
||||||
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||||
|
|
||||||
public DoiExactMatch(final Map<String, Number> params) {
|
public DoiExactMatch(final Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("domainExactMatch")
|
@ComparatorClass("domainExactMatch")
|
||||||
public class DomainExactMatch extends ExactMatchIgnoreCase {
|
public class DomainExactMatch extends ExactMatchIgnoreCase {
|
||||||
|
|
||||||
public DomainExactMatch(final Map<String, Number> params) {
|
public DomainExactMatch(final Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("exactMatch")
|
@ComparatorClass("exactMatch")
|
||||||
public class ExactMatch extends AbstractComparator {
|
public class ExactMatch extends AbstractComparator {
|
||||||
|
|
||||||
public ExactMatch(Map<String, Number> params){
|
public ExactMatch(Map<String, String> params){
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("exactMatchIgnoreCase")
|
@ComparatorClass("exactMatchIgnoreCase")
|
||||||
public class ExactMatchIgnoreCase extends AbstractComparator {
|
public class ExactMatchIgnoreCase extends AbstractComparator {
|
||||||
|
|
||||||
public ExactMatchIgnoreCase(Map<String, Number> params) {
|
public ExactMatchIgnoreCase(Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("jaroWinkler")
|
@ComparatorClass("jaroWinkler")
|
||||||
public class JaroWinkler extends AbstractComparator {
|
public class JaroWinkler extends AbstractComparator {
|
||||||
|
|
||||||
public JaroWinkler(Map<String, Number> params){
|
public JaroWinkler(Map<String, String> params){
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -13,9 +13,9 @@ import java.util.Set;
|
||||||
@ComparatorClass("jaroWinklerNormalizedName")
|
@ComparatorClass("jaroWinklerNormalizedName")
|
||||||
public class JaroWinklerNormalizedName extends AbstractComparator {
|
public class JaroWinklerNormalizedName extends AbstractComparator {
|
||||||
|
|
||||||
private Map<String, Number> params;
|
private Map<String, String> params;
|
||||||
|
|
||||||
public JaroWinklerNormalizedName(Map<String, Number> params){
|
public JaroWinklerNormalizedName(Map<String, String> params){
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
@ -39,11 +39,11 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
|
||||||
ca = filterAllStopWords(ca);
|
ca = filterAllStopWords(ca);
|
||||||
cb = filterAllStopWords(cb);
|
cb = filterAllStopWords(cb);
|
||||||
|
|
||||||
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
|
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||||
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
|
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||||
|
|
||||||
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
|
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||||
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
|
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||||
|
|
||||||
ca = removeKeywords(ca, keywords1);
|
ca = removeKeywords(ca, keywords1);
|
||||||
ca = removeKeywords(ca, cities1);
|
ca = removeKeywords(ca, cities1);
|
||||||
|
|
|
@ -12,7 +12,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("jaroWinklerTitle")
|
@ComparatorClass("jaroWinklerTitle")
|
||||||
public class JaroWinklerTitle extends AbstractComparator {
|
public class JaroWinklerTitle extends AbstractComparator {
|
||||||
|
|
||||||
public JaroWinklerTitle(Map<String, Number> params){
|
public JaroWinklerTitle(Map<String, String> params){
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,9 +10,9 @@ import java.util.Set;
|
||||||
@ComparatorClass("keywordMatch")
|
@ComparatorClass("keywordMatch")
|
||||||
public class KeywordMatch extends AbstractComparator {
|
public class KeywordMatch extends AbstractComparator {
|
||||||
|
|
||||||
Map<String, Number> params;
|
Map<String, String> params;
|
||||||
|
|
||||||
public KeywordMatch(Map<String, Number> params) {
|
public KeywordMatch(Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
@ -29,8 +29,8 @@ public class KeywordMatch extends AbstractComparator {
|
||||||
ca = filterAllStopWords(ca);
|
ca = filterAllStopWords(ca);
|
||||||
cb = filterAllStopWords(cb);
|
cb = filterAllStopWords(cb);
|
||||||
|
|
||||||
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
|
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||||
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
|
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||||
|
|
||||||
Set<String> codes1 = toCodes(keywords1, conf.translationMap());
|
Set<String> codes1 = toCodes(keywords1, conf.translationMap());
|
||||||
Set<String> codes2 = toCodes(keywords2, conf.translationMap());
|
Set<String> codes2 = toCodes(keywords2, conf.translationMap());
|
||||||
|
@ -41,7 +41,7 @@ public class KeywordMatch extends AbstractComparator {
|
||||||
else {
|
else {
|
||||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||||
return -1; //undefined if one of the two has no keywords
|
return -1; //undefined if one of the two has no keywords
|
||||||
return commonElementsPercentage(codes1, codes2) > params.getOrDefault("threshold", 0).intValue() ? 1.0 : 0.0;
|
return commonElementsPercentage(codes1, codes2) > Double.parseDouble(params.getOrDefault("threshold", "0.0")) ? 1.0 : 0.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,7 +9,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("level2JaroWinkler")
|
@ComparatorClass("level2JaroWinkler")
|
||||||
public class Level2JaroWinkler extends AbstractComparator {
|
public class Level2JaroWinkler extends AbstractComparator {
|
||||||
|
|
||||||
public Level2JaroWinkler(Map<String, Number> params){
|
public Level2JaroWinkler(Map<String, String> params){
|
||||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("level2JaroWinklerTitle")
|
@ComparatorClass("level2JaroWinklerTitle")
|
||||||
public class Level2JaroWinklerTitle extends AbstractComparator {
|
public class Level2JaroWinklerTitle extends AbstractComparator {
|
||||||
|
|
||||||
public Level2JaroWinklerTitle(Map<String,Number> params){
|
public Level2JaroWinklerTitle(Map<String,String> params){
|
||||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("level2Levenstein")
|
@ComparatorClass("level2Levenstein")
|
||||||
public class Level2Levenstein extends AbstractComparator {
|
public class Level2Levenstein extends AbstractComparator {
|
||||||
|
|
||||||
public Level2Levenstein(Map<String,Number> params){
|
public Level2Levenstein(Map<String,String> params){
|
||||||
super(params, new com.wcohen.ss.Level2Levenstein());
|
super(params, new com.wcohen.ss.Level2Levenstein());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("levenstein")
|
@ComparatorClass("levenstein")
|
||||||
public class Levenstein extends AbstractComparator {
|
public class Levenstein extends AbstractComparator {
|
||||||
|
|
||||||
public Levenstein(Map<String,Number> params){
|
public Levenstein(Map<String,String> params){
|
||||||
super(params, new com.wcohen.ss.Levenstein());
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ public class LevensteinTitle extends AbstractComparator {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(LevensteinTitle.class);
|
private static final Log log = LogFactory.getLog(LevensteinTitle.class);
|
||||||
|
|
||||||
public LevensteinTitle(Map<String,Number> params){
|
public LevensteinTitle(Map<String,String> params){
|
||||||
super(params, new com.wcohen.ss.Levenstein());
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -14,7 +14,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("levensteinTitleIgnoreVersion")
|
@ComparatorClass("levensteinTitleIgnoreVersion")
|
||||||
public class LevensteinTitleIgnoreVersion extends AbstractComparator {
|
public class LevensteinTitleIgnoreVersion extends AbstractComparator {
|
||||||
|
|
||||||
public LevensteinTitleIgnoreVersion(Map<String,Number> params){
|
public LevensteinTitleIgnoreVersion(Map<String,String> params){
|
||||||
super(params, new com.wcohen.ss.Levenstein());
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,7 +10,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("mustBeDifferent")
|
@ComparatorClass("mustBeDifferent")
|
||||||
public class MustBeDifferent extends AbstractComparator {
|
public class MustBeDifferent extends AbstractComparator {
|
||||||
|
|
||||||
public MustBeDifferent(Map<String,Number> params){
|
public MustBeDifferent(Map<String,String> params){
|
||||||
super(params, new com.wcohen.ss.Levenstein());
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -14,7 +14,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("null")
|
@ComparatorClass("null")
|
||||||
public class NullDistanceAlgo implements Comparator {
|
public class NullDistanceAlgo implements Comparator {
|
||||||
|
|
||||||
public NullDistanceAlgo(Map<String, Number> params){
|
public NullDistanceAlgo(Map<String, String> params){
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -20,9 +20,9 @@ import java.util.stream.Collectors;
|
||||||
public class PidMatch extends AbstractComparator {
|
public class PidMatch extends AbstractComparator {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(PidMatch.class);
|
private static final Log log = LogFactory.getLog(PidMatch.class);
|
||||||
private Map<String, Number> params;
|
private Map<String, String> params;
|
||||||
|
|
||||||
public PidMatch(final Map<String, Number> params) {
|
public PidMatch(final Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
@ -50,7 +50,7 @@ public class PidMatch extends AbstractComparator {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
return (double)incommon / (incommon + simDiff) > params.getOrDefault("threshold", 0.5).doubleValue() ? 1 : 0;
|
return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -24,7 +24,7 @@ public class SizeMatch extends AbstractComparator {
|
||||||
* @param params
|
* @param params
|
||||||
* the parameters
|
* the parameters
|
||||||
*/
|
*/
|
||||||
public SizeMatch(final Map<String, Number> params) {
|
public SizeMatch(final Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,7 @@ import java.util.Map;
|
||||||
@ComparatorClass("sortedJaroWinkler")
|
@ComparatorClass("sortedJaroWinkler")
|
||||||
public class SortedJaroWinkler extends AbstractSortedComparator {
|
public class SortedJaroWinkler extends AbstractSortedComparator {
|
||||||
|
|
||||||
public SortedJaroWinkler(Map<String,Number> params){
|
public SortedJaroWinkler(Map<String,String> params){
|
||||||
super(params, new com.wcohen.ss.Levenstein());
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -22,7 +22,7 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
|
||||||
super(weight, new com.wcohen.ss.Level2JaroWinkler());
|
super(weight, new com.wcohen.ss.Level2JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
public SortedLevel2JaroWinkler(final Map<String, Number> params){
|
public SortedLevel2JaroWinkler(final Map<String, String> params){
|
||||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -31,9 +31,9 @@ public class SubStringLevenstein extends AbstractComparator {
|
||||||
super(w, new com.wcohen.ss.Levenstein());
|
super(w, new com.wcohen.ss.Levenstein());
|
||||||
}
|
}
|
||||||
|
|
||||||
public SubStringLevenstein(Map<String, Number> params){
|
public SubStringLevenstein(Map<String, String> params){
|
||||||
super(params, new com.wcohen.ss.Levenstein());
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
this.limit = params.get("limit").intValue();
|
this.limit = Integer.parseInt(params.getOrDefault("limit", "1"));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -17,7 +17,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
@ComparatorClass("titleVersionMatch")
|
@ComparatorClass("titleVersionMatch")
|
||||||
public class TitleVersionMatch extends AbstractComparator {
|
public class TitleVersionMatch extends AbstractComparator {
|
||||||
|
|
||||||
public TitleVersionMatch(final Map<String, Number> params) {
|
public TitleVersionMatch(final Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -12,19 +12,19 @@ import java.util.Map;
|
||||||
@ComparatorClass("urlMatcher")
|
@ComparatorClass("urlMatcher")
|
||||||
public class UrlMatcher extends Levenstein {
|
public class UrlMatcher extends Levenstein {
|
||||||
|
|
||||||
private Map<String, Number> params;
|
private Map<String, String> params;
|
||||||
|
|
||||||
public UrlMatcher(Map<String, Number> params){
|
public UrlMatcher(Map<String, String> params){
|
||||||
super(params);
|
super(params);
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
public UrlMatcher(double weight, Map<String, Number> params) {
|
public UrlMatcher(double weight, Map<String, String> params) {
|
||||||
super(weight);
|
super(weight);
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setParams(Map<String, Number> params) {
|
public void setParams(Map<String, String> params) {
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -37,8 +37,8 @@ public class UrlMatcher extends Levenstein {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
Double hostW = params.get("host").doubleValue();
|
Double hostW = Double.parseDouble(params.getOrDefault("host", "0.5"));
|
||||||
Double pathW = params.get("path").doubleValue();
|
Double pathW = Double.parseDouble(params.getOrDefault("path", "0.5"));
|
||||||
|
|
||||||
if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
|
if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
|
||||||
return hostW * 0.5;
|
return hostW * 0.5;
|
||||||
|
|
|
@ -18,7 +18,7 @@ public class YearMatch extends AbstractComparator {
|
||||||
|
|
||||||
private int limit = 4;
|
private int limit = 4;
|
||||||
|
|
||||||
public YearMatch(final Map<String, Number> params) {
|
public YearMatch(final Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,13 +18,13 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
|
||||||
/** The weight. */
|
/** The weight. */
|
||||||
protected double weight = 0.0;
|
protected double weight = 0.0;
|
||||||
|
|
||||||
private Map<String, Number> params;
|
private Map<String, String> params;
|
||||||
|
|
||||||
protected AbstractComparator(Map<String, Number> params) {
|
protected AbstractComparator(Map<String, String> params) {
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected AbstractComparator(Map<String, Number> params, final AbstractStringDistance ssalgo){
|
protected AbstractComparator(Map<String, String> params, final AbstractStringDistance ssalgo){
|
||||||
this.params = params;
|
this.params = params;
|
||||||
this.weight = 1.0;
|
this.weight = 1.0;
|
||||||
this.ssalgo = ssalgo;
|
this.ssalgo = ssalgo;
|
||||||
|
|
|
@ -23,8 +23,8 @@ public abstract class AbstractSortedComparator extends AbstractComparator {
|
||||||
super(weight, ssalgo);
|
super(weight, ssalgo);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected AbstractSortedComparator(final Map<String, Number> params, final AbstractStringDistance ssalgo){
|
protected AbstractSortedComparator(final Map<String, String> params, final AbstractStringDistance ssalgo){
|
||||||
super(params.get("weight").doubleValue(), ssalgo);
|
super(Double.parseDouble(params.get("weight")), ssalgo);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -12,7 +12,7 @@ public class FieldConf implements Serializable {
|
||||||
private String field; //name of the field on which apply the comparator
|
private String field; //name of the field on which apply the comparator
|
||||||
private String comparator; //comparator name
|
private String comparator; //comparator name
|
||||||
private double weight = 1.0; //weight for the field (to be used in the aggregation)
|
private double weight = 1.0; //weight for the field (to be used in the aggregation)
|
||||||
private Map<String,Number> params; //parameters
|
private Map<String,String> params; //parameters
|
||||||
|
|
||||||
private boolean countIfUndefined;
|
private boolean countIfUndefined;
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ public class FieldConf implements Serializable {
|
||||||
public FieldConf() {
|
public FieldConf() {
|
||||||
}
|
}
|
||||||
|
|
||||||
public FieldConf(String field, String comparator, double weight, Map<String, Number> params, boolean countIfUndefined) {
|
public FieldConf(String field, String comparator, double weight, Map<String, String> params, boolean countIfUndefined) {
|
||||||
this.field = field;
|
this.field = field;
|
||||||
this.comparator = comparator;
|
this.comparator = comparator;
|
||||||
this.weight = weight;
|
this.weight = weight;
|
||||||
|
@ -59,11 +59,11 @@ public class FieldConf implements Serializable {
|
||||||
this.weight = weight;
|
this.weight = weight;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, Number> getParams() {
|
public Map<String, String> getParams() {
|
||||||
return params;
|
return params;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setParams(Map<String, Number> params) {
|
public void setParams(Map<String, String> params) {
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -46,7 +46,7 @@ public class TreeNodeDef implements Serializable {
|
||||||
|
|
||||||
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||||
|
|
||||||
stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField(), new FieldStats(weight, result, fieldConf.isCountIfUndefined()));
|
stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, result, fieldConf.isCountIfUndefined()));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -23,7 +23,6 @@ public class TreeProcessor{
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean compare(final MapDocument a, final MapDocument b) {
|
public boolean compare(final MapDocument a, final MapDocument b) {
|
||||||
|
|
||||||
//evaluate the decision tree
|
//evaluate the decision tree
|
||||||
return evaluateTree(a, b).getResult() == MatchType.MATCH;
|
return evaluateTree(a, b).getResult() == MatchType.MATCH;
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,7 @@ public class PaceResolver implements Serializable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Comparator getComparator(String name, Map<String, Number> params) throws PaceException {
|
public Comparator getComparator(String name, Map<String, String> params) throws PaceException {
|
||||||
try {
|
try {
|
||||||
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) {
|
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) {
|
||||||
|
|
|
@ -2,9 +2,12 @@ package eu.dnetlib.pace.comparators;
|
||||||
|
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||||
import eu.dnetlib.pace.tree.CityMatch;
|
import eu.dnetlib.pace.tree.CityMatch;
|
||||||
|
import eu.dnetlib.pace.tree.ContainsMatch;
|
||||||
import eu.dnetlib.pace.tree.JaroWinklerNormalizedName;
|
import eu.dnetlib.pace.tree.JaroWinklerNormalizedName;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
import eu.dnetlib.pace.tree.KeywordMatch;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
|
import org.junit.Ignore;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
|
@ -17,13 +20,13 @@ import static junit.framework.Assert.assertTrue;
|
||||||
|
|
||||||
public class ComparatorTest extends AbstractPaceFunctions {
|
public class ComparatorTest extends AbstractPaceFunctions {
|
||||||
|
|
||||||
private Map<String, Number> params;
|
private Map<String, String> params;
|
||||||
private DedupConfig conf;
|
private DedupConfig conf;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setup() {
|
public void setup() {
|
||||||
params = new HashMap<>();
|
params = new HashMap<>();
|
||||||
params.put("weight", 1.0);
|
params.put("weight", "1.0");
|
||||||
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ComparatorTest.class));
|
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ComparatorTest.class));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -34,103 +37,6 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
||||||
System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa"));
|
System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testJaroWinklerNormalizedName() {
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
|
||||||
double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State", conf);
|
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
|
||||||
assertEquals(0.0, result);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testJaroWinklerNormalizedName2() {
|
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
|
||||||
double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York", conf);
|
|
||||||
|
|
||||||
assertEquals(1.0, result);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testJaroWinklerNormalizedName3() {
|
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
|
||||||
double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf);
|
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
|
||||||
assertEquals(0.0, result);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testJaroWinklerNormalizedName4() {
|
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
|
||||||
double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa", conf);
|
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
|
||||||
assertEquals(1.0, result);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testJaroWinklerNormalizedName5() {
|
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
|
||||||
double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS", conf);
|
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
|
||||||
assertEquals(1.0, result);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testJaroWinklerNormalizedName6() {
|
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
|
||||||
double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf);
|
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
|
||||||
assertTrue(result > 0.9);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testJaroWinklerNormalizedName7() {
|
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
|
||||||
double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf);
|
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
|
||||||
assertTrue(result > 0.9);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testJaroWinklerNormalizedName8() {
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
|
||||||
|
|
||||||
double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf);
|
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testJaroWinklerNormalizedName9() {
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
|
||||||
|
|
||||||
double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf);
|
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testJaroWinklerNormalizedName10(){
|
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
|
||||||
|
|
||||||
double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf);
|
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void cityMatchTest() {
|
public void cityMatchTest() {
|
||||||
final CityMatch cityMatch = new CityMatch(params);
|
final CityMatch cityMatch = new CityMatch(params);
|
||||||
|
@@ -147,6 +53,53 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
||||||
//both names with cities (different)
|
//both names with cities (different)
|
||||||
assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
|
assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
|
||||||
|
|
||||||
|
//particular cases
|
||||||
|
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
|
||||||
|
assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf));
|
||||||
|
}
|
||||||
|
|
||||||
|
// @Test
|
||||||
|
// public void testJaroWinklerNormalizedName6() {
|
||||||
|
//
|
||||||
|
// final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
|
// double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf);
|
||||||
|
//
|
||||||
|
// System.out.println("result = " + result);
|
||||||
|
// assertTrue(result > 0.9);
|
||||||
|
//
|
||||||
|
// }
|
||||||
|
// @Test
|
||||||
|
// public void testJaroWinklerNormalizedName10(){
|
||||||
|
//
|
||||||
|
// final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
|
//
|
||||||
|
// double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf);
|
||||||
|
//
|
||||||
|
// System.out.println("result = " + result);
|
||||||
|
// }
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void keywordMatchTest(){
|
||||||
|
params.put("threshold", "0.4");
|
||||||
|
|
||||||
|
final KeywordMatch keywordMatch = new KeywordMatch(params);
|
||||||
|
|
||||||
|
assertEquals(1.0, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
|
||||||
|
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
|
||||||
|
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
|
||||||
|
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void containsMatchTest(){
|
||||||
|
|
||||||
|
params.put("string", "openorgs");
|
||||||
|
params.put("bool", "XOR");
|
||||||
|
params.put("caseSensitive", "false");
|
||||||
|
|
||||||
|
final ContainsMatch containsMatch = new ContainsMatch(params);
|
||||||
|
|
||||||
|
assertEquals(0.0, containsMatch.distance("openorgs", "openorgs", conf));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@@ -7,7 +7,6 @@ import java.util.Map;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertNotNull;
|
import static org.junit.Assert.assertNotNull;
|
||||||
import static org.junit.Assert.assertTrue;
|
|
||||||
|
|
||||||
public class ConfigTest extends AbstractPaceTest {
|
public class ConfigTest extends AbstractPaceTest {
|
||||||
|
|
||||||
|
|
|
@@ -8,11 +8,11 @@ import java.util.Map;
|
||||||
|
|
||||||
public class UtilTest {
|
public class UtilTest {
|
||||||
|
|
||||||
Map<String, Number> params;
|
Map<String, String> params;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setUp(){
|
public void setUp(){
|
||||||
params = new HashMap<String, Number>();
|
params = new HashMap<String, String>();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
Loading…
Reference in New Issue