forked from D-Net/dnet-hadoop
Update the implementation of the decision tree: add new logical aggregations and statistics
This commit is contained in:
parent
b3748b8d77
commit
4dce785375
|
@ -225,17 +225,6 @@ public abstract class AbstractPaceFunctions {
|
|||
return s.trim();
|
||||
}
|
||||
|
||||
public double keywordsCompare(Set<String> s1, Set<String> s2, Map<String, String> translationMap){
|
||||
|
||||
Set<String> k1 = keywordsToCodes(s1, translationMap);
|
||||
Set<String> k2 = keywordsToCodes(s2, translationMap);
|
||||
|
||||
if (k1.isEmpty() || k2.isEmpty())
|
||||
return 1.0;
|
||||
|
||||
return commonElementsPercentage(k1, k2);
|
||||
}
|
||||
|
||||
public double commonElementsPercentage(Set<String> s1, Set<String> s2){
|
||||
|
||||
int longer = (s1.size()>s2.size())?s1.size():s2.size();
|
||||
|
|
|
@ -50,5 +50,10 @@ public interface Config {
|
|||
public Map<String, List<String>> blacklists();
|
||||
|
||||
|
||||
/**
|
||||
* Returns the translation map that comparators pass to keyword extraction (e.g. {@code getKeywords}).
|
||||
*
|
||||
* @return the map
|
||||
* */
|
||||
public Map<String, String> translationMap();
|
||||
}
|
||||
|
|
|
@ -34,7 +34,6 @@ public class DedupConfig implements Config, Serializable {
|
|||
private static Map<String, String> defaults = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
defaults.put("threshold", "0");
|
||||
defaults.put("dedupRun", "001");
|
||||
defaults.put("entityType", "result");
|
||||
defaults.put("subEntityType", "resulttype");
|
||||
|
@ -46,6 +45,7 @@ public class DedupConfig implements Config, Serializable {
|
|||
defaults.put("rootBuilder", "result");
|
||||
defaults.put("includeChildren", "true");
|
||||
defaults.put("maxIterations", "20");
|
||||
defaults.put("idPath", "$.id");
|
||||
}
|
||||
|
||||
/** No-argument constructor; the body is empty, so no field is assigned here. */
public DedupConfig() {}
|
||||
|
|
|
@ -86,7 +86,6 @@ public class WfConfig implements Serializable {
|
|||
/** The JSONPath expression used to retrieve the identifier */
|
||||
private String idPath = "$.id";
|
||||
|
||||
|
||||
/** No-argument constructor: assigns nothing, so fields keep their declared initial values (e.g. idPath stays "$.id"). */
public WfConfig() {}
|
||||
|
||||
/**
|
||||
|
@ -100,8 +99,6 @@ public class WfConfig implements Serializable {
|
|||
* the root builder families
|
||||
* @param dedupRun
|
||||
* the dedup run
|
||||
* @param threshold
|
||||
* the threshold
|
||||
* @param skipList
|
||||
* the skip list
|
||||
* @param queueMaxSize
|
||||
|
@ -112,22 +109,25 @@ public class WfConfig implements Serializable {
|
|||
* the sliding window size
|
||||
* @param includeChildren
|
||||
* allows the children to be included in the representative records or not.
|
||||
* @param maxIterations
|
||||
* the maximum number of iterations
|
||||
* @param idPath
|
||||
* the path for the id of the entity
|
||||
*/
|
||||
// Full constructor: copies every argument into the field of the same name.
// Only the signature that ends with idPath is kept, because the body assigns this.idPath;
// the shorter parameter list without idPath was a stale leftover that made the declaration unparsable.
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun,
        final double threshold,
        final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) {
    super();
    this.entityType = entityType;
    this.orderField = orderField;
    this.rootBuilder = rootBuilder;
    // dedupRun is the only argument not stored verbatim: it is passed through cleanupStringNumber first.
    this.dedupRun = cleanupStringNumber(dedupRun);
    this.threshold = threshold;
    this.skipList = skipList;
    this.queueMaxSize = queueMaxSize;
    this.groupMaxSize = groupMaxSize;
    this.slidingWindowSize = slidingWindowSize;
    this.includeChildren = includeChildren;
    this.maxIterations = maxIterations;
    // Replaces the field's declared value with the caller-supplied path, as is (no null check is done here).
    this.idPath = idPath;
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -39,8 +39,6 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
|
|||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
//TODO: change this implementation; it only needs to erase cities and keywords
|
||||
|
||||
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
|
|
|
@ -5,12 +5,12 @@ import eu.dnetlib.pace.util.PaceException;
|
|||
public enum AggType {
|
||||
|
||||
W_MEAN, //weighted mean
|
||||
AVG, //average
|
||||
AVG, //average
|
||||
SUM,
|
||||
MAX,
|
||||
MIN,
|
||||
AND, //used for necessary conditions
|
||||
OR; //used for sufficient conditions
|
||||
AND, //used for necessary conditions
|
||||
OR; //used for sufficient conditions
|
||||
|
||||
public static AggType getEnum(String value) {
|
||||
|
||||
|
|
|
@ -8,11 +8,14 @@ import java.io.IOException;
|
|||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The class that defines the configuration of each field in the decision tree.
|
||||
* */
|
||||
public class FieldConf implements Serializable {
|
||||
|
||||
private String field; //name of the field on which apply the comparator
|
||||
private String comparator; //comparator name
|
||||
private double weight = 1.0; //weight for the field (to be used in the aggregation)
|
||||
private String field; //name of the field on which apply the comparator
|
||||
private String comparator; //comparator name
|
||||
private double weight = 1.0; //weight for the field (to be used in the aggregation)
|
||||
private Map<String,String> params; //parameters
|
||||
|
||||
private boolean countIfUndefined;
|
||||
|
|
|
@ -8,10 +8,13 @@ import eu.dnetlib.pace.util.PaceException;
|
|||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* The class that contains the result of each comparison in the decision tree
|
||||
* */
|
||||
public class FieldStats implements Serializable {
|
||||
|
||||
private double weight; //weight for the field (to be used in the aggregation)
|
||||
private double threshold; //threshold for the field (to be used in case it is a sufficient or a necessary condition)
|
||||
private double threshold; //threshold for the field (to be used in some kind of aggregations)
|
||||
private double result; //the result of the comparison
|
||||
private Field a;
|
||||
private Field b;
|
||||
|
|
|
@ -6,7 +6,6 @@ import eu.dnetlib.pace.config.PaceConfig;
|
|||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
@ -34,21 +33,30 @@ public class TreeNodeDef implements Serializable {
|
|||
this.ignoreUndefined = ignoreUndefined;
|
||||
}
|
||||
|
||||
public TreeNodeDef() {
|
||||
}
|
||||
public TreeNodeDef() {}
|
||||
|
||||
//function for the evaluation of the node
|
||||
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) {
|
||||
|
||||
TreeNodeStats stats = new TreeNodeStats();
|
||||
|
||||
//for each field in the node, it computes the comparison result and records it in the node statistics
|
||||
for (FieldConf fieldConf : fields) {
|
||||
|
||||
double weight = fieldConf.getWeight();
|
||||
|
||||
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||
|
||||
stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")), result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField())));
|
||||
|
||||
stats.addFieldStats(
|
||||
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
|
||||
new FieldStats(
|
||||
weight,
|
||||
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")),
|
||||
result,
|
||||
fieldConf.isCountIfUndefined(),
|
||||
doc1.getFieldMap().get(fieldConf.getField()),
|
||||
doc2.getFieldMap().get(fieldConf.getField())
|
||||
));
|
||||
}
|
||||
|
||||
return stats;
|
||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.tree.support;
|
|||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.*;
|
||||
import eu.dnetlib.pace.model.gt.Match;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
|
|
@ -4,7 +4,6 @@ import com.google.common.collect.Lists;
|
|||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.config.WfConfig;
|
||||
//import eu.dnetlib.pace.distance.PaceDocumentDistance;
|
||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
|
@ -36,11 +35,9 @@ public class BlockProcessor {
|
|||
this.dedupConf = dedupConf;
|
||||
}
|
||||
|
||||
|
||||
public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context) {
|
||||
if (documents.size() > 1) {
|
||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
||||
//process(q, context);
|
||||
process(prepare(documents), context);
|
||||
|
||||
} else {
|
||||
|
@ -54,7 +51,6 @@ public class BlockProcessor {
|
|||
|
||||
if (q.size() > 1) {
|
||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
||||
//process(q, context);
|
||||
process(simplifyQueue(q, key, context), context);
|
||||
|
||||
} else {
|
||||
|
@ -128,8 +124,6 @@ public class BlockProcessor {
|
|||
|
||||
private void process(final Queue<MapDocument> queue, final Reporter context) {
|
||||
|
||||
// final PaceDocumentDistance algo = new PaceDocumentDistance();
|
||||
|
||||
while (!queue.isEmpty()) {
|
||||
|
||||
final MapDocument pivot = queue.remove();
|
||||
|
@ -140,8 +134,6 @@ public class BlockProcessor {
|
|||
final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? null : fieldsPivot.stringValue();
|
||||
|
||||
if (fieldPivot != null) {
|
||||
// System.out.println(idPivot + " --> " + fieldPivot);
|
||||
|
||||
int i = 0;
|
||||
for (final MapDocument curr : queue) {
|
||||
final String idCurr = curr.getIdentifier();
|
||||
|
|
Loading…
Reference in New Issue