Master branch updates from beta September 2023 #337
|
@ -225,17 +225,6 @@ public abstract class AbstractPaceFunctions {
|
||||||
return s.trim();
|
return s.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
public double keywordsCompare(Set<String> s1, Set<String> s2, Map<String, String> translationMap){
|
|
||||||
|
|
||||||
Set<String> k1 = keywordsToCodes(s1, translationMap);
|
|
||||||
Set<String> k2 = keywordsToCodes(s2, translationMap);
|
|
||||||
|
|
||||||
if (k1.isEmpty() || k2.isEmpty())
|
|
||||||
return 1.0;
|
|
||||||
|
|
||||||
return commonElementsPercentage(k1, k2);
|
|
||||||
}
|
|
||||||
|
|
||||||
public double commonElementsPercentage(Set<String> s1, Set<String> s2){
|
public double commonElementsPercentage(Set<String> s1, Set<String> s2){
|
||||||
|
|
||||||
int longer = (s1.size()>s2.size())?s1.size():s2.size();
|
int longer = (s1.size()>s2.size())?s1.size():s2.size();
|
||||||
|
|
|
@ -50,5 +50,10 @@ public interface Config {
|
||||||
public Map<String, List<String>> blacklists();
|
public Map<String, List<String>> blacklists();
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Translation map.
|
||||||
|
*
|
||||||
|
* @return the map
|
||||||
|
* */
|
||||||
public Map<String, String> translationMap();
|
public Map<String, String> translationMap();
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,7 +34,6 @@ public class DedupConfig implements Config, Serializable {
|
||||||
private static Map<String, String> defaults = Maps.newHashMap();
|
private static Map<String, String> defaults = Maps.newHashMap();
|
||||||
|
|
||||||
static {
|
static {
|
||||||
defaults.put("threshold", "0");
|
|
||||||
defaults.put("dedupRun", "001");
|
defaults.put("dedupRun", "001");
|
||||||
defaults.put("entityType", "result");
|
defaults.put("entityType", "result");
|
||||||
defaults.put("subEntityType", "resulttype");
|
defaults.put("subEntityType", "resulttype");
|
||||||
|
@ -46,6 +45,7 @@ public class DedupConfig implements Config, Serializable {
|
||||||
defaults.put("rootBuilder", "result");
|
defaults.put("rootBuilder", "result");
|
||||||
defaults.put("includeChildren", "true");
|
defaults.put("includeChildren", "true");
|
||||||
defaults.put("maxIterations", "20");
|
defaults.put("maxIterations", "20");
|
||||||
|
defaults.put("idPath", "$.id");
|
||||||
}
|
}
|
||||||
|
|
||||||
public DedupConfig() {}
|
public DedupConfig() {}
|
||||||
|
|
|
@ -86,7 +86,6 @@ public class WfConfig implements Serializable {
|
||||||
/** The JSONPath expression used to retrieve the identifier */
|
/** The JSONPath expression used to retrieve the identifier */
|
||||||
private String idPath = "$.id";
|
private String idPath = "$.id";
|
||||||
|
|
||||||
|
|
||||||
public WfConfig() {}
|
public WfConfig() {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -100,8 +99,6 @@ public class WfConfig implements Serializable {
|
||||||
* the root builder families
|
* the root builder families
|
||||||
* @param dedupRun
|
* @param dedupRun
|
||||||
* the dedup run
|
* the dedup run
|
||||||
* @param threshold
|
|
||||||
* the threshold
|
|
||||||
* @param skipList
|
* @param skipList
|
||||||
* the skip list
|
* the skip list
|
||||||
* @param queueMaxSize
|
* @param queueMaxSize
|
||||||
|
@ -112,22 +109,25 @@ public class WfConfig implements Serializable {
|
||||||
* the sliding window size
|
* the sliding window size
|
||||||
* @param includeChildren
|
* @param includeChildren
|
||||||
* allows the children to be included in the representative records or not.
|
* allows the children to be included in the representative records or not.
|
||||||
|
* @param maxIterations
|
||||||
|
* the maximum number of iterations
|
||||||
|
* @param idPath
|
||||||
|
* the path for the id of the entity
|
||||||
*/
|
*/
|
||||||
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun,
|
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun,
|
||||||
final double threshold,
|
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) {
|
||||||
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations) {
|
|
||||||
super();
|
super();
|
||||||
this.entityType = entityType;
|
this.entityType = entityType;
|
||||||
this.orderField = orderField;
|
this.orderField = orderField;
|
||||||
this.rootBuilder = rootBuilder;
|
this.rootBuilder = rootBuilder;
|
||||||
this.dedupRun = cleanupStringNumber(dedupRun);
|
this.dedupRun = cleanupStringNumber(dedupRun);
|
||||||
this.threshold = threshold;
|
|
||||||
this.skipList = skipList;
|
this.skipList = skipList;
|
||||||
this.queueMaxSize = queueMaxSize;
|
this.queueMaxSize = queueMaxSize;
|
||||||
this.groupMaxSize = groupMaxSize;
|
this.groupMaxSize = groupMaxSize;
|
||||||
this.slidingWindowSize = slidingWindowSize;
|
this.slidingWindowSize = slidingWindowSize;
|
||||||
this.includeChildren = includeChildren;
|
this.includeChildren = includeChildren;
|
||||||
this.maxIterations = maxIterations;
|
this.maxIterations = maxIterations;
|
||||||
|
this.idPath = idPath;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -39,8 +39,6 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
|
||||||
ca = filterAllStopWords(ca);
|
ca = filterAllStopWords(ca);
|
||||||
cb = filterAllStopWords(cb);
|
cb = filterAllStopWords(cb);
|
||||||
|
|
||||||
//TODO change this implementation, it needs only to erase cities and keywords
|
|
||||||
|
|
||||||
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||||
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||||
|
|
||||||
|
|
|
@ -8,6 +8,9 @@ import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The class that defines the configuration of each field in the decision tree.
|
||||||
|
* */
|
||||||
public class FieldConf implements Serializable {
|
public class FieldConf implements Serializable {
|
||||||
|
|
||||||
private String field; //name of the field on which to apply the comparator
|
private String field; //name of the field on which to apply the comparator
|
||||||
|
|
|
@ -8,10 +8,13 @@ import eu.dnetlib.pace.util.PaceException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The class that contains the result of each comparison in the decision tree
|
||||||
|
* */
|
||||||
public class FieldStats implements Serializable {
|
public class FieldStats implements Serializable {
|
||||||
|
|
||||||
private double weight; //weight for the field (to be used in the aggregation)
|
private double weight; //weight for the field (to be used in the aggregation)
|
||||||
private double threshold; //threshold for the field (to be used in case it is a sufficient or a necessary condition)
|
private double threshold; //threshold for the field (to be used in some kinds of aggregation)
|
||||||
private double result; //the result of the comparison
|
private double result; //the result of the comparison
|
||||||
private Field a;
|
private Field a;
|
||||||
private Field b;
|
private Field b;
|
||||||
|
|
|
@ -6,7 +6,6 @@ import eu.dnetlib.pace.config.PaceConfig;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
import eu.dnetlib.pace.util.PaceException;
|
import eu.dnetlib.pace.util.PaceException;
|
||||||
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -34,21 +33,30 @@ public class TreeNodeDef implements Serializable {
|
||||||
this.ignoreUndefined = ignoreUndefined;
|
this.ignoreUndefined = ignoreUndefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
public TreeNodeDef() {
|
public TreeNodeDef() {}
|
||||||
}
|
|
||||||
|
|
||||||
|
//function for the evaluation of the node
|
||||||
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) {
|
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) {
|
||||||
|
|
||||||
TreeNodeStats stats = new TreeNodeStats();
|
TreeNodeStats stats = new TreeNodeStats();
|
||||||
|
|
||||||
|
//for each field in the node, it computes the comparison result between the two documents and records it in the stats
|
||||||
for (FieldConf fieldConf : fields) {
|
for (FieldConf fieldConf : fields) {
|
||||||
|
|
||||||
double weight = fieldConf.getWeight();
|
double weight = fieldConf.getWeight();
|
||||||
|
|
||||||
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||||
|
|
||||||
stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")), result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField())));
|
stats.addFieldStats(
|
||||||
|
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
|
||||||
|
new FieldStats(
|
||||||
|
weight,
|
||||||
|
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")),
|
||||||
|
result,
|
||||||
|
fieldConf.isCountIfUndefined(),
|
||||||
|
doc1.getFieldMap().get(fieldConf.getField()),
|
||||||
|
doc2.getFieldMap().get(fieldConf.getField())
|
||||||
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
return stats;
|
return stats;
|
||||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.tree.support;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.*;
|
import eu.dnetlib.pace.model.*;
|
||||||
|
import eu.dnetlib.pace.model.gt.Match;
|
||||||
import eu.dnetlib.pace.util.PaceException;
|
import eu.dnetlib.pace.util.PaceException;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
|
@ -4,7 +4,6 @@ import com.google.common.collect.Lists;
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
import eu.dnetlib.pace.config.WfConfig;
|
import eu.dnetlib.pace.config.WfConfig;
|
||||||
//import eu.dnetlib.pace.distance.PaceDocumentDistance;
|
|
||||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
|
@ -36,11 +35,9 @@ public class BlockProcessor {
|
||||||
this.dedupConf = dedupConf;
|
this.dedupConf = dedupConf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context) {
|
public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context) {
|
||||||
if (documents.size() > 1) {
|
if (documents.size() > 1) {
|
||||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
// log.info("reducing key: '" + key + "' records: " + q.size());
|
||||||
//process(q, context);
|
|
||||||
process(prepare(documents), context);
|
process(prepare(documents), context);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
@ -54,7 +51,6 @@ public class BlockProcessor {
|
||||||
|
|
||||||
if (q.size() > 1) {
|
if (q.size() > 1) {
|
||||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
// log.info("reducing key: '" + key + "' records: " + q.size());
|
||||||
//process(q, context);
|
|
||||||
process(simplifyQueue(q, key, context), context);
|
process(simplifyQueue(q, key, context), context);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
@ -128,8 +124,6 @@ public class BlockProcessor {
|
||||||
|
|
||||||
private void process(final Queue<MapDocument> queue, final Reporter context) {
|
private void process(final Queue<MapDocument> queue, final Reporter context) {
|
||||||
|
|
||||||
// final PaceDocumentDistance algo = new PaceDocumentDistance();
|
|
||||||
|
|
||||||
while (!queue.isEmpty()) {
|
while (!queue.isEmpty()) {
|
||||||
|
|
||||||
final MapDocument pivot = queue.remove();
|
final MapDocument pivot = queue.remove();
|
||||||
|
@ -140,8 +134,6 @@ public class BlockProcessor {
|
||||||
final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? null : fieldsPivot.stringValue();
|
final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? null : fieldsPivot.stringValue();
|
||||||
|
|
||||||
if (fieldPivot != null) {
|
if (fieldPivot != null) {
|
||||||
// System.out.println(idPivot + " --> " + fieldPivot);
|
|
||||||
|
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (final MapDocument curr : queue) {
|
for (final MapDocument curr : queue) {
|
||||||
final String idCurr = curr.getIdentifier();
|
final String idCurr = curr.getIdentifier();
|
||||||
|
|
Loading…
Reference in New Issue