JaroWinklerNormalizedName split into 3 different comparators: CityMatch, KeywordMatch and JaroWinkler. Implementation of the tree statistics support functions

This commit is contained in:
miconis 2019-11-20 10:45:00 +01:00
parent c687956371
commit ddd40540aa
12 changed files with 334 additions and 99 deletions

View File

@ -13,7 +13,6 @@ public class RandomClusteringFunction extends AbstractClusteringFunction {
@Override @Override
protected Collection<String> doApply(final Config conf, String s) { protected Collection<String> doApply(final Config conf, String s) {
// TODO Auto-generated method stub
return null; return null;
} }

View File

@ -216,14 +216,19 @@ public abstract class AbstractPaceFunctions {
Set<String> k1 = keywordsToCodes(s1, translationMap); Set<String> k1 = keywordsToCodes(s1, translationMap);
Set<String> k2 = keywordsToCodes(s2, translationMap); Set<String> k2 = keywordsToCodes(s2, translationMap);
int longer = (k1.size()>k2.size())?k1.size():k2.size();
if (k1.isEmpty() || k2.isEmpty()) if (k1.isEmpty() || k2.isEmpty())
return 1.0; return 1.0;
else
return (double)CollectionUtils.intersection(k1,k2).size()/(double)longer; return commonElementsPercentage(k1, k2);
} }
public double commonElementsPercentage(Set<String> s1, Set<String> s2){
int longer = (s1.size()>s2.size())?s1.size():s2.size();
return (double)CollectionUtils.intersection(s1,s2).size()/(double)longer;
}
//convert the set of keywords to codes //convert the set of keywords to codes
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) { public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());

View File

@ -0,0 +1,47 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
import java.util.Set;

/**
 * Comparator that matches two strings on the cities they mention.
 * <p>
 * Both inputs are cleaned, normalized and stripped of stopwords, then the city
 * names detected in each (within a sliding window of {@code windowSize} tokens)
 * are translated to city codes and compared.
 * <p>
 * Result:
 * <ul>
 *   <li>1.0  if neither string contains a city;</li>
 *   <li>-1   (undefined) if exactly one of the two contains a city;</li>
 *   <li>1.0  if the fraction of common city codes exceeds {@code threshold}, 0.0 otherwise.</li>
 * </ul>
 */
@ComparatorClass("cityMatch")
public class CityMatch extends AbstractComparator {

    private Map<String, Number> params; //comparator configuration: windowSize, threshold

    public CityMatch(Map<String, Number> params) {
        super(params);
        this.params = params;
    }

    @Override
    public double distance(final String a, final String b, final Config conf) {

        String ca = cleanup(a);
        String cb = cleanup(b);

        ca = normalize(ca);
        cb = normalize(cb);

        ca = filterAllStopWords(ca);
        cb = filterAllStopWords(cb);

        Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
        Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());

        Set<String> codes1 = citiesToCodes(cities1);
        Set<String> codes2 = citiesToCodes(cities2);

        //if no cities are detected, the comparator gives 1.0
        if (codes1.isEmpty() && codes2.isEmpty())
            return 1.0;
        else {
            if (codes1.isEmpty() ^ codes2.isEmpty())
                return -1; //undefined if one of the two has no cities
            // threshold is a fraction in [0,1]: read it with doubleValue() — intValue()
            // would truncate e.g. 0.5 to 0, making any single common city enough to match
            return commonElementsPercentage(codes1, codes2) > params.getOrDefault("threshold", 0).doubleValue() ? 1.0 : 0.0;
        }
    }
}

View File

@ -45,40 +45,15 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
if (checkCities(cities1,cities2)) { ca = removeKeywords(ca, keywords1);
ca = removeKeywords(ca, cities1);
cb = removeKeywords(cb, keywords2);
cb = removeKeywords(cb, cities2);
if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) { if (ca.isEmpty() && cb.isEmpty())
return 1.0;
ca = removeKeywords(ca, keywords1); else
ca = removeKeywords(ca, cities1); return normalize(ssalgo.score(ca,cb));
cb = removeKeywords(cb, keywords2);
cb = removeKeywords(cb, cities2);
if (ca.isEmpty() && cb.isEmpty())
return 1.0;
else
return normalize(ssalgo.score(ca,cb));
}
}
return 0.0;
}
//returns true if at least 1 city is in common
//returns true if no cities are contained in names
//returns false if one of the two names have no city
public boolean checkCities(Set<String> s1, Set<String> s2){
Set<String> c1 = citiesToCodes(s1);
Set<String> c2 = citiesToCodes(s2);
if (c1.isEmpty() && c2.isEmpty())
return true;
else {
if (c1.isEmpty() ^ c2.isEmpty())
return false;
return CollectionUtils.intersection(c1, c2).size() > 0;
}
} }
@Override @Override

View File

@ -0,0 +1,47 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
import java.util.Set;

/**
 * Comparator that matches two strings on the keywords they contain.
 * <p>
 * Both inputs are cleaned, normalized and stripped of stopwords, then the
 * keywords detected in each (within a sliding window of {@code windowSize}
 * tokens) are translated to codes via the configuration's translation map
 * and compared.
 * <p>
 * Result:
 * <ul>
 *   <li>1.0  if neither string contains a keyword;</li>
 *   <li>-1   (undefined) if exactly one of the two contains a keyword;</li>
 *   <li>1.0  if the fraction of common keyword codes exceeds {@code threshold}, 0.0 otherwise.</li>
 * </ul>
 */
@ComparatorClass("keywordMatch")
public class KeywordMatch extends AbstractComparator {

    Map<String, Number> params; //comparator configuration: windowSize, threshold

    public KeywordMatch(Map<String, Number> params) {
        super(params);
        this.params = params;
    }

    @Override
    public double distance(final String a, final String b, final Config conf) {

        String ca = cleanup(a);
        String cb = cleanup(b);

        ca = normalize(ca);
        cb = normalize(cb);

        ca = filterAllStopWords(ca);
        cb = filterAllStopWords(cb);

        Set<String> keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
        Set<String> keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());

        Set<String> codes1 = toCodes(keywords1, conf.translationMap());
        Set<String> codes2 = toCodes(keywords2, conf.translationMap());

        //if no keywords are detected, the comparator gives 1.0 (comment fixed: was "cities", copy-paste from CityMatch)
        if (codes1.isEmpty() && codes2.isEmpty())
            return 1.0;
        else {
            if (codes1.isEmpty() ^ codes2.isEmpty())
                return -1; //undefined if one of the two has no keywords
            // threshold is a fraction in [0,1]: read it with doubleValue() — intValue()
            // would truncate e.g. 0.5 to 0, making any single common keyword enough to match
            return commonElementsPercentage(codes1, codes2) > params.getOrDefault("threshold", 0).doubleValue() ? 1.0 : 0.0;
        }
    }
}

View File

@ -0,0 +1,54 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;

/**
 * Outcome of a single field comparison inside a tree node: the comparator
 * result, the weight it carries in the node aggregation, and whether an
 * undefined result (-1) must still contribute to the weight sum.
 */
public class FieldStats implements Serializable {

    private double weight;            //weight for the field (to be used in the aggregation)
    private double result;            //the result of the comparison
    private boolean countIfUndefined; //count the field in the aggregation even when the comparison is undefined

    public FieldStats(double weight, double result, boolean countIfUndefined) {
        this.weight = weight;
        this.result = result;
        this.countIfUndefined = countIfUndefined;
    }

    public double getWeight() {
        return weight;
    }

    public double getResult() {
        return result;
    }

    public boolean isCountIfUndefined() {
        return countIfUndefined;
    }

    public void setWeight(double w) {
        this.weight = w;
    }

    public void setResult(double r) {
        this.result = r;
    }

    public void setCountIfUndefined(boolean flag) {
        this.countIfUndefined = flag;
    }

    /** JSON rendering of this object, mainly for debugging/statistics dumps. */
    @Override
    public String toString() {
        try {
            return new ObjectMapper().writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("Impossible to convert to JSON: ", e);
        }
    }
}

View File

@ -39,7 +39,6 @@ public class TreeNodeDef implements Serializable {
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) { public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) {
TreeNodeStats stats = new TreeNodeStats(); TreeNodeStats stats = new TreeNodeStats();
stats.setFieldsCount(fields.size());
for (FieldConf fieldConf : fields) { for (FieldConf fieldConf : fields) {
@ -47,16 +46,7 @@ public class TreeNodeDef implements Serializable {
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
if (result == -1) { //if the comparison is undefined stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField(), new FieldStats(weight, result, fieldConf.isCountIfUndefined()));
stats.incrementUndefinedCount();
if (fieldConf.isCountIfUndefined()) { //if it must be taken into account, increment weights (i.e. the average would be lower)
stats.incrementWeightsSum(weight);
}
}
else { //if the field is not missing
stats.incrementScoresSum(weight * result);
stats.incrementWeightsSum(weight);
}
} }

View File

@ -1,90 +1,108 @@
package eu.dnetlib.pace.tree.support; package eu.dnetlib.pace.tree.support;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
public class TreeNodeStats implements Serializable { public class TreeNodeStats implements Serializable {
private DescriptiveStatistics stats; private Map<String, FieldStats> results; //this is an accumulator for the results of the node
private int undefinedCount = 0; //counter for the number of undefined comparisons between the fields in the tree node
private int fieldsCount = 0;
private double weightsSum = 0.0;
public TreeNodeStats(){ public TreeNodeStats(){
this.stats = new DescriptiveStatistics(); this.results = new HashMap<>();
} }
public TreeNodeStats(int undefinedCount, int fieldsCount, double weightsSum) { public Map<String, FieldStats> getResults() {
this.undefinedCount = undefinedCount; return results;
this.fieldsCount = fieldsCount;
this.weightsSum = weightsSum;
} }
public DescriptiveStatistics getStats() { public void addFieldStats(String id, FieldStats fieldStats){
return stats; this.results.put(id, fieldStats);
} }
public void setStats(DescriptiveStatistics stats) { public int fieldsCount(){
this.stats = stats; return this.results.size();
} }
public int getUndefinedCount() { public int undefinedCount(){
int undefinedCount = 0;
for(FieldStats fs: this.results.values()){
if(fs.getResult() == -1)
undefinedCount ++;
}
return undefinedCount; return undefinedCount;
} }
public void setUndefinedCount(int undefinedCount) { public double scoreSum(){
this.undefinedCount = undefinedCount; double scoreSum = 0.0;
for(FieldStats fs: this.results.values()){
if(fs.getResult()>=0.0) {
scoreSum += fs.getResult();
}
}
return scoreSum;
} }
public int getFieldsCount() { //return the sum of the weights without considering the fields with countIfMissing=false && result=-1
return fieldsCount; public double weightSum(){
double weightSum = 0.0;
for(FieldStats fs: this.results.values()){
if(fs.getResult()>=0.0 || (fs.getResult()<0.0 && fs.isCountIfUndefined())) {
weightSum += fs.getWeight();
}
}
return weightSum;
} }
public void setFieldsCount(int fields) { public double weightedScoreSum(){
this.fieldsCount = fields; double weightedScoreSum = 0.0;
for(FieldStats fs: this.results.values()){
if(fs.getResult()>=0.0) {
weightedScoreSum += fs.getResult()*fs.getWeight();
}
}
return weightedScoreSum;
} }
public double getWeightsSum() { public double max(){
return weightsSum; double max = -1.0;
for(FieldStats fs: this.results.values()){
if(fs.getResult()>max)
max = fs.getResult();
}
return max;
} }
public void setWeightsSum(double weightsSum) { public double min(){
this.weightsSum = weightsSum; double min = 100.0; //random high value
} for(FieldStats fs: this.results.values()){
if(fs.getResult()<min) {
public void incrementWeightsSum(double delta){ if (fs.getResult()>=0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
this.weightsSum += delta; min = fs.getResult();
} }
}
public void incrementUndefinedCount(){ return min;
this.undefinedCount += 1;
}
public void incrementScoresSum(double delta){
this.stats.addValue(delta);
} }
public double getFinalScore(AggType aggregation){ public double getFinalScore(AggType aggregation){
switch (aggregation){ switch (aggregation){
case AVG: case AVG:
return stats.getMean(); return scoreSum()/fieldsCount();
case SUM: case SUM:
return stats.getSum(); return scoreSum();
case SC: case SC:
case OR: case OR:
case MAX: case MAX:
return stats.getMax(); return max();
case NC: case NC:
case AND: case AND:
case MIN: case MIN:
return stats.getMin(); return min();
case W_MEAN: case W_MEAN:
return stats.getSum()/weightsSum; return weightedScoreSum()/weightSum();
default: default:
return 0.0; return 0.0;
} }
} }
} }

View File

@ -6,12 +6,13 @@ import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import java.io.Serializable;
import java.util.Map; import java.util.Map;
/** /**
* The compare between two documents is given by the weighted mean of the field distances * The compare between two documents is given by the weighted mean of the field distances
*/ */
public class TreeProcessor { public class TreeProcessor{
private static final Log log = LogFactory.getLog(TreeProcessor.class); private static final Log log = LogFactory.getLog(TreeProcessor.class);
@ -24,10 +25,12 @@ public class TreeProcessor {
public boolean compare(final MapDocument a, final MapDocument b) { public boolean compare(final MapDocument a, final MapDocument b) {
//evaluate the decision tree //evaluate the decision tree
return evaluateTree(a, b) == MatchType.MATCH; return evaluateTree(a, b).getResult() == MatchType.MATCH;
} }
public MatchType evaluateTree(final MapDocument doc1, final MapDocument doc2){ public TreeStats evaluateTree(final MapDocument doc1, final MapDocument doc2){
TreeStats treeStats = new TreeStats();
String current = "start"; String current = "start";
@ -39,9 +42,10 @@ public class TreeProcessor {
throw new PaceException("The Tree Node doesn't exist: " + current); throw new PaceException("The Tree Node doesn't exist: " + current);
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
treeStats.addNodeStats(current, stats);
//if ignoreUndefined=false the miss is considered as undefined //if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.getUndefinedCount()>0) { if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) {
current = currentNode.getUndefined(); current = currentNode.getUndefined();
} }
//if ignoreUndefined=true the miss is ignored and the score computed anyway //if ignoreUndefined=true the miss is ignored and the score computed anyway
@ -54,7 +58,8 @@ public class TreeProcessor {
} }
return MatchType.parse(current); treeStats.setResult(MatchType.parse(current));
return treeStats;
} }
public double computeScore(final MapDocument doc1, final MapDocument doc2) { public double computeScore(final MapDocument doc1, final MapDocument doc2) {
@ -72,7 +77,7 @@ public class TreeProcessor {
score = stats.getFinalScore(currentNode.getAggregation()); score = stats.getFinalScore(currentNode.getAggregation());
//if ignoreUndefined=false the miss is considered as undefined //if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.getUndefinedCount()>0) { if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) {
current = currentNode.getUndefined(); current = currentNode.getUndefined();
} }
//if ignoreUndefined=true the miss is ignored and the score computed anyway //if ignoreUndefined=true the miss is ignored and the score computed anyway

View File

@ -0,0 +1,51 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Aggregated statistics for a full decision-tree evaluation: one
 * {@link TreeNodeStats} per visited node (keyed by node id) plus the final
 * {@link MatchType} outcome of the walk.
 */
public class TreeStats {

    //<layer_id, <field:comparator, result>>
    Map<String, TreeNodeStats> stats;
    MatchType result;

    public TreeStats() {
        this.stats = new HashMap<>();
        this.result = MatchType.NO_MATCH; //default outcome until the tree walk sets it
    }

    public MatchType getResult() {
        return this.result;
    }

    public void setResult(MatchType outcome) {
        this.result = outcome;
    }

    public Map<String, TreeNodeStats> getStats() {
        return stats;
    }

    public void setStats(Map<String, TreeNodeStats> nodeStats) {
        this.stats = nodeStats;
    }

    /** Records the statistics produced by the node identified by {@code layerID}. */
    public void addNodeStats(String layerID, TreeNodeStats treeNodeStats) {
        this.stats.put(layerID, treeNodeStats);
    }

    /** Pretty-printed JSON rendering, mainly for debugging/statistics dumps. */
    @Override
    public String toString() {
        try {
            return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("Impossible to convert to JSON: ", e);
        }
    }
}

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.comparators; package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.tree.CityMatch;
import eu.dnetlib.pace.tree.JaroWinklerNormalizedName; import eu.dnetlib.pace.tree.JaroWinklerNormalizedName;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import org.junit.Before; import org.junit.Before;
@ -129,4 +130,24 @@ public class ComparatorTest extends AbstractPaceFunctions {
System.out.println("result = " + result); System.out.println("result = " + result);
} }
@Test
public void cityMatchTest() {
final CityMatch cityMatch = new CityMatch(params);
//both names with no cities
assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf));
//one of the two names with no cities
assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf));
//both names with cities (same)
assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf));
//both names with cities (different)
assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
}
} }

View File

@ -0,0 +1,23 @@
package eu.dnetlib.pace.util;
import org.junit.Before;
import org.junit.Test;
import java.util.HashMap;
import java.util.Map;

/** Smoke tests for the pace utility classes. */
public class UtilTest {

    Map<String, Number> params; //empty comparator configuration shared by the tests

    @Before
    public void setUp() {
        params = new HashMap<>();
    }

    @Test
    public void paceResolverTest() {
        // resolving a registered comparator by name must succeed without throwing
        final PaceResolver paceResolver = new PaceResolver();
        paceResolver.getComparator("keywordMatch", params);
    }
}