update in the implementation of the tree: addition of new logic aggregations and statistics

This commit is contained in:
miconis 2020-01-14 11:42:43 +02:00
parent b3748b8d77
commit 4dce785375
11 changed files with 39 additions and 40 deletions

View File

@ -225,17 +225,6 @@ public abstract class AbstractPaceFunctions {
return s.trim(); return s.trim();
} }
public double keywordsCompare(Set<String> s1, Set<String> s2, Map<String, String> translationMap){
Set<String> k1 = keywordsToCodes(s1, translationMap);
Set<String> k2 = keywordsToCodes(s2, translationMap);
if (k1.isEmpty() || k2.isEmpty())
return 1.0;
return commonElementsPercentage(k1, k2);
}
public double commonElementsPercentage(Set<String> s1, Set<String> s2){ public double commonElementsPercentage(Set<String> s1, Set<String> s2){
int longer = (s1.size()>s2.size())?s1.size():s2.size(); int longer = (s1.size()>s2.size())?s1.size():s2.size();

View File

@ -50,5 +50,10 @@ public interface Config {
public Map<String, List<String>> blacklists(); public Map<String, List<String>> blacklists();
/**
* Translation map.
*
* @return the map
* */
public Map<String, String> translationMap(); public Map<String, String> translationMap();
} }

View File

@ -34,7 +34,6 @@ public class DedupConfig implements Config, Serializable {
private static Map<String, String> defaults = Maps.newHashMap(); private static Map<String, String> defaults = Maps.newHashMap();
static { static {
defaults.put("threshold", "0");
defaults.put("dedupRun", "001"); defaults.put("dedupRun", "001");
defaults.put("entityType", "result"); defaults.put("entityType", "result");
defaults.put("subEntityType", "resulttype"); defaults.put("subEntityType", "resulttype");
@ -46,6 +45,7 @@ public class DedupConfig implements Config, Serializable {
defaults.put("rootBuilder", "result"); defaults.put("rootBuilder", "result");
defaults.put("includeChildren", "true"); defaults.put("includeChildren", "true");
defaults.put("maxIterations", "20"); defaults.put("maxIterations", "20");
defaults.put("idPath", "$.id");
} }
public DedupConfig() {} public DedupConfig() {}

View File

@ -86,7 +86,6 @@ public class WfConfig implements Serializable {
/** The JSONPath expression used to retrieve the identifier */ /** The JSONPath expression used to retrieve the identifier */
private String idPath = "$.id"; private String idPath = "$.id";
public WfConfig() {} public WfConfig() {}
/** /**
@ -100,8 +99,6 @@ public class WfConfig implements Serializable {
* the root builder families * the root builder families
* @param dedupRun * @param dedupRun
* the dedup run * the dedup run
* @param threshold
* the threshold
* @param skipList * @param skipList
* the skip list * the skip list
* @param queueMaxSize * @param queueMaxSize
@ -112,22 +109,25 @@ public class WfConfig implements Serializable {
* the sliding window size * the sliding window size
* @param includeChildren * @param includeChildren
* allows the children to be included in the representative records or not. * allows the children to be included in the representative records or not.
* @param maxIterations
* the maximum number of iterations
* @param idPath
* the path for the id of the entity
*/ */
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun, public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun,
final double threshold, final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) {
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations) {
super(); super();
this.entityType = entityType; this.entityType = entityType;
this.orderField = orderField; this.orderField = orderField;
this.rootBuilder = rootBuilder; this.rootBuilder = rootBuilder;
this.dedupRun = cleanupStringNumber(dedupRun); this.dedupRun = cleanupStringNumber(dedupRun);
this.threshold = threshold;
this.skipList = skipList; this.skipList = skipList;
this.queueMaxSize = queueMaxSize; this.queueMaxSize = queueMaxSize;
this.groupMaxSize = groupMaxSize; this.groupMaxSize = groupMaxSize;
this.slidingWindowSize = slidingWindowSize; this.slidingWindowSize = slidingWindowSize;
this.includeChildren = includeChildren; this.includeChildren = includeChildren;
this.maxIterations = maxIterations; this.maxIterations = maxIterations;
this.idPath = idPath;
} }
/** /**

View File

@ -39,8 +39,6 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
ca = filterAllStopWords(ca); ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb); cb = filterAllStopWords(cb);
//TODO change this implementation, it needs only to erase cities and keywords
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));

View File

@ -8,6 +8,9 @@ import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.Map; import java.util.Map;
/**
* The class that defines the configuration of each field in the decision tree.
* */
public class FieldConf implements Serializable { public class FieldConf implements Serializable {
private String field; //name of the field on which apply the comparator private String field; //name of the field on which apply the comparator

View File

@ -8,10 +8,13 @@ import eu.dnetlib.pace.util.PaceException;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
/**
* The class that contains the result of each comparison in the decision tree
* */
public class FieldStats implements Serializable { public class FieldStats implements Serializable {
private double weight; //weight for the field (to be used in the aggregation) private double weight; //weight for the field (to be used in the aggregation)
private double threshold; //threshold for the field (to be used in case it is a sufficient or a necessary condition) private double threshold; //threshold for the field (to be used in some kinds of aggregation)
private double result; //the result of the comparison private double result; //the result of the comparison
private Field a; private Field a;
private Field b; private Field b;

View File

@ -6,7 +6,6 @@ import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException; import eu.dnetlib.pace.util.PaceException;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.List;
@ -34,21 +33,30 @@ public class TreeNodeDef implements Serializable {
this.ignoreUndefined = ignoreUndefined; this.ignoreUndefined = ignoreUndefined;
} }
public TreeNodeDef() { public TreeNodeDef() {}
}
//function for the evaluation of the node
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) { public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) {
TreeNodeStats stats = new TreeNodeStats(); TreeNodeStats stats = new TreeNodeStats();
//for each field in the node, it computes the comparison result and adds it to the node statistics
for (FieldConf fieldConf : fields) { for (FieldConf fieldConf : fields) {
double weight = fieldConf.getWeight(); double weight = fieldConf.getWeight();
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")), result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()))); stats.addFieldStats(
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
new FieldStats(
weight,
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")),
result,
fieldConf.isCountIfUndefined(),
doc1.getFieldMap().get(fieldConf.getField()),
doc2.getFieldMap().get(fieldConf.getField())
));
} }
return stats; return stats;

View File

@ -2,6 +2,7 @@ package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.*; import eu.dnetlib.pace.model.*;
import eu.dnetlib.pace.model.gt.Match;
import eu.dnetlib.pace.util.PaceException; import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;

View File

@ -4,7 +4,6 @@ import com.google.common.collect.Lists;
import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.WfConfig; import eu.dnetlib.pace.config.WfConfig;
//import eu.dnetlib.pace.distance.PaceDocumentDistance;
import eu.dnetlib.pace.tree.support.TreeProcessor; import eu.dnetlib.pace.tree.support.TreeProcessor;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocument;
@ -36,11 +35,9 @@ public class BlockProcessor {
this.dedupConf = dedupConf; this.dedupConf = dedupConf;
} }
public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context) { public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context) {
if (documents.size() > 1) { if (documents.size() > 1) {
// log.info("reducing key: '" + key + "' records: " + q.size()); // log.info("reducing key: '" + key + "' records: " + q.size());
//process(q, context);
process(prepare(documents), context); process(prepare(documents), context);
} else { } else {
@ -54,7 +51,6 @@ public class BlockProcessor {
if (q.size() > 1) { if (q.size() > 1) {
// log.info("reducing key: '" + key + "' records: " + q.size()); // log.info("reducing key: '" + key + "' records: " + q.size());
//process(q, context);
process(simplifyQueue(q, key, context), context); process(simplifyQueue(q, key, context), context);
} else { } else {
@ -128,8 +124,6 @@ public class BlockProcessor {
private void process(final Queue<MapDocument> queue, final Reporter context) { private void process(final Queue<MapDocument> queue, final Reporter context) {
// final PaceDocumentDistance algo = new PaceDocumentDistance();
while (!queue.isEmpty()) { while (!queue.isEmpty()) {
final MapDocument pivot = queue.remove(); final MapDocument pivot = queue.remove();
@ -140,8 +134,6 @@ public class BlockProcessor {
final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? null : fieldsPivot.stringValue(); final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? null : fieldsPivot.stringValue();
if (fieldPivot != null) { if (fieldPivot != null) {
// System.out.println(idPivot + " --> " + fieldPivot);
int i = 0; int i = 0;
for (final MapDocument curr : queue) { for (final MapDocument curr : queue) {
final String idCurr = curr.getIdentifier(); final String idCurr = curr.getIdentifier();