From 4dce78537597dc1257b4e8426ae74cca939723d8 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 14 Jan 2020 11:42:43 +0200 Subject: [PATCH] update in the implementation of the tree: addition of new logic aggregations and statistics --- .../pace/common/AbstractPaceFunctions.java | 11 ----------- .../java/eu/dnetlib/pace/config/Config.java | 5 +++++ .../eu/dnetlib/pace/config/DedupConfig.java | 2 +- .../java/eu/dnetlib/pace/config/WfConfig.java | 12 ++++++------ .../pace/tree/JaroWinklerNormalizedName.java | 2 -- .../eu/dnetlib/pace/tree/support/AggType.java | 6 +++--- .../dnetlib/pace/tree/support/FieldConf.java | 9 ++++++--- .../dnetlib/pace/tree/support/FieldStats.java | 5 ++++- .../dnetlib/pace/tree/support/TreeNodeDef.java | 18 +++++++++++++----- .../pace/tree/support/TreeProcessor.java | 1 + .../eu/dnetlib/pace/util/BlockProcessor.java | 8 -------- 11 files changed, 39 insertions(+), 40 deletions(-) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index b9f30ff694..705781e834 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -225,17 +225,6 @@ public abstract class AbstractPaceFunctions { return s.trim(); } - public double keywordsCompare(Set s1, Set s2, Map translationMap){ - - Set k1 = keywordsToCodes(s1, translationMap); - Set k2 = keywordsToCodes(s2, translationMap); - - if (k1.isEmpty() || k2.isEmpty()) - return 1.0; - - return commonElementsPercentage(k1, k2); - } - public double commonElementsPercentage(Set s1, Set s2){ int longer = (s1.size()>s2.size())?s1.size():s2.size(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java index 2cdace1deb..32f192fa0e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java @@ -50,5 +50,10 @@ public interface Config { public Map> blacklists(); + /** + * Translation map. + * + * @return the map + * */ public Map translationMap(); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index 261e13bf53..6f91ebf0c6 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -34,7 +34,6 @@ public class DedupConfig implements Config, Serializable { private static Map defaults = Maps.newHashMap(); static { - defaults.put("threshold", "0"); defaults.put("dedupRun", "001"); defaults.put("entityType", "result"); defaults.put("subEntityType", "resulttype"); @@ -46,6 +45,7 @@ public class DedupConfig implements Config, Serializable { defaults.put("rootBuilder", "result"); defaults.put("includeChildren", "true"); defaults.put("maxIterations", "20"); + defaults.put("idPath", "$.id"); } public DedupConfig() {} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java index a79d234d94..3cc5a38a2a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java @@ -86,7 +86,6 @@ public class WfConfig implements Serializable { /** The Jquery path to retrieve the identifier */ private String idPath = "$.id"; - public WfConfig() {} /** @@ -100,8 +99,6 @@ public class WfConfig implements Serializable { * the root builder families * @param dedupRun * the dedup run - * @param threshold - * the threshold * @param skipList * the skip list * @param queueMaxSize @@ -112,22 +109,25 @@ public class WfConfig implements Serializable { * the sliding window size * @param includeChildren * allows the children to be included in the representative records or not. + * @param maxIterations + * the maximum number of iterations + * @param idPath + * the path for the id of the entity */ public WfConfig(final String entityType, final String orderField, final List rootBuilder, final String dedupRun, - final double threshold, - final Set skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations) { + final Set skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) { super(); this.entityType = entityType; this.orderField = orderField; this.rootBuilder = rootBuilder; this.dedupRun = cleanupStringNumber(dedupRun); - this.threshold = threshold; this.skipList = skipList; this.queueMaxSize = queueMaxSize; this.groupMaxSize = groupMaxSize; this.slidingWindowSize = slidingWindowSize; this.includeChildren = includeChildren; this.maxIterations = maxIterations; + this.idPath = idPath; } /** diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java index 76af574090..b89cffaed5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java @@ -39,8 +39,6 @@ public class JaroWinklerNormalizedName extends AbstractComparator { ca = filterAllStopWords(ca); cb = filterAllStopWords(cb); - //TODO change this implementation, it needs only to erase cities and keywords - Set keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); Set keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java index 6ea8172574..caf7cd4c88 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java @@ -5,12 +5,12 @@ import eu.dnetlib.pace.util.PaceException; public enum AggType { W_MEAN, //weighted mean - AVG, //average + AVG, //average SUM, MAX, MIN, - AND, //used for necessary conditions - OR; //used for sufficient conditions + AND, //used for necessary conditions + OR; //used for sufficient conditions public static AggType getEnum(String value) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java index 055eaaf186..44971876fc 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java @@ -8,11 +8,14 @@ import java.io.IOException; import java.io.Serializable; import java.util.Map; +/** + * The class that defines the configuration of each field in the decision tree. + * */ public class FieldConf implements Serializable { - private String field; //name of the field on which apply the comparator - private String comparator; //comparator name - private double weight = 1.0; //weight for the field (to be used in the aggregation) + private String field; //name of the field on which apply the comparator + private String comparator; //comparator name + private double weight = 1.0; //weight for the field (to be used in the aggregation) private Map params; //parameters private boolean countIfUndefined; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java index 072a2276ec..fb0b51b47a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java @@ -8,10 +8,13 @@ import eu.dnetlib.pace.util.PaceException; import java.io.IOException; import java.io.Serializable; +/** + * The class that contains the result of each comparison in the decision tree + * */ public class FieldStats implements Serializable { private double weight; //weight for the field (to be used in the aggregation) - private double threshold; //threshold for the field (to be used in case it is a sufficient or a necessary condition) + private double threshold; //threshold for the field (to be used in some kind of aggregations) private double result; //the result of the comparison private Field a; private Field b; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index 037ed72c3d..7b13118a6a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -6,7 +6,6 @@ import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.PaceException; - import java.io.IOException; import java.io.Serializable; import java.util.List; @@ -34,21 +33,30 @@ public class TreeNodeDef implements Serializable { this.ignoreUndefined = ignoreUndefined; } - public TreeNodeDef() { - } + public TreeNodeDef() {} + //function for the evaluation of the node public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) { TreeNodeStats stats = new TreeNodeStats(); + //for each field in the node, it computes the for (FieldConf fieldConf : fields) { double weight = fieldConf.getWeight(); double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); - stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")), result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()))); - + stats.addFieldStats( + fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), + new FieldStats( + weight, + Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")), + result, + fieldConf.isCountIfUndefined(), + doc1.getFieldMap().get(fieldConf.getField()), + doc2.getFieldMap().get(fieldConf.getField()) + )); } return stats; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java index 731f659b33..abc685ec6d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.tree.support; import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.*; +import eu.dnetlib.pace.model.gt.Match; import eu.dnetlib.pace.util.PaceException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index bc846e71d6..34a6aa2cf7 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -4,7 +4,6 @@ import com.google.common.collect.Lists; import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.WfConfig; -//import eu.dnetlib.pace.distance.PaceDocumentDistance; import eu.dnetlib.pace.tree.support.TreeProcessor; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.MapDocument; @@ -36,11 +35,9 @@ public class BlockProcessor { this.dedupConf = dedupConf; } - public void processSortedBlock(final String key, final List documents, final Reporter context) { if (documents.size() > 1) { // log.info("reducing key: '" + key + "' records: " + q.size()); - //process(q, context); process(prepare(documents), context); } else { @@ -54,7 +51,6 @@ public class BlockProcessor { if (q.size() > 1) { // log.info("reducing key: '" + key + "' records: " + q.size()); - //process(q, context); process(simplifyQueue(q, key, context), context); } else { @@ -128,8 +124,6 @@ public class BlockProcessor { private void process(final Queue queue, final Reporter context) { -// final PaceDocumentDistance algo = new PaceDocumentDistance(); - while (!queue.isEmpty()) { final MapDocument pivot = queue.remove(); @@ -140,8 +134,6 @@ public class BlockProcessor { final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? null : fieldsPivot.stringValue(); if (fieldPivot != null) { - // System.out.println(idPivot + " --> " + fieldPivot); - int i = 0; for (final MapDocument curr : queue) { final String idCurr = curr.getIdentifier();