Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
11 changed files with 39 additions and 40 deletions
Showing only changes of commit 4dce785375 - Show all commits

View File

@ -225,17 +225,6 @@ public abstract class AbstractPaceFunctions {
return s.trim();
}
/**
 * Compares two keyword sets after translating each of them into codes.
 *
 * @param s1 the first set of keywords
 * @param s2 the second set of keywords
 * @param translationMap the map handed to {@code keywordsToCodes} for both sets
 * @return {@code 1.0} when either translated set is empty, otherwise the value
 *         returned by {@code commonElementsPercentage} for the two translated sets
 */
public double keywordsCompare(Set<String> s1, Set<String> s2, Map<String, String> translationMap){
    final Set<String> firstCodes = keywordsToCodes(s1, translationMap);
    final Set<String> secondCodes = keywordsToCodes(s2, translationMap);

    // Only measure the overlap when both sides produced at least one code.
    final boolean bothHaveCodes = !firstCodes.isEmpty() && !secondCodes.isEmpty();
    return bothHaveCodes ? commonElementsPercentage(firstCodes, secondCodes) : 1.0;
}
public double commonElementsPercentage(Set<String> s1, Set<String> s2){
int longer = (s1.size()>s2.size())?s1.size():s2.size();

View File

@ -50,5 +50,10 @@ public interface Config {
public Map<String, List<String>> blacklists();
/**
* Translation map.
*
* @return the map
* */
public Map<String, String> translationMap();
}

View File

@ -34,7 +34,6 @@ public class DedupConfig implements Config, Serializable {
private static Map<String, String> defaults = Maps.newHashMap();
static {
defaults.put("threshold", "0");
defaults.put("dedupRun", "001");
defaults.put("entityType", "result");
defaults.put("subEntityType", "resulttype");
@ -46,6 +45,7 @@ public class DedupConfig implements Config, Serializable {
defaults.put("rootBuilder", "result");
defaults.put("includeChildren", "true");
defaults.put("maxIterations", "20");
defaults.put("idPath", "$.id");
}
public DedupConfig() {}

View File

@ -86,7 +86,6 @@ public class WfConfig implements Serializable {
/** The JSON path expression used to retrieve the identifier */
private String idPath = "$.id";
public WfConfig() {}
/**
@ -100,8 +99,6 @@ public class WfConfig implements Serializable {
* the root builder families
* @param dedupRun
* the dedup run
* @param threshold
* the threshold
* @param skipList
* the skip list
* @param queueMaxSize
@ -112,22 +109,25 @@ public class WfConfig implements Serializable {
* the sliding window size
* @param includeChildren
* allows the children to be included in the representative records or not.
* @param maxIterations
* the maximum number of iterations
* @param idPath
* the path for the id of the entity
*/
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun,
final double threshold,
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations) {
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) {
super();
this.entityType = entityType;
this.orderField = orderField;
this.rootBuilder = rootBuilder;
this.dedupRun = cleanupStringNumber(dedupRun);
this.threshold = threshold;
this.skipList = skipList;
this.queueMaxSize = queueMaxSize;
this.groupMaxSize = groupMaxSize;
this.slidingWindowSize = slidingWindowSize;
this.includeChildren = includeChildren;
this.maxIterations = maxIterations;
this.idPath = idPath;
}
/**

View File

@ -39,8 +39,6 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
//TODO change this implementation, it needs only to erase cities and keywords
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));

View File

@ -8,6 +8,9 @@ import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
/**
* The class that defines the configuration of each field in the decision tree.
* */
public class FieldConf implements Serializable {
private String field; //name of the field on which apply the comparator

View File

@ -8,10 +8,13 @@ import eu.dnetlib.pace.util.PaceException;
import java.io.IOException;
import java.io.Serializable;
/**
* The class that contains the result of each comparison in the decision tree
* */
public class FieldStats implements Serializable {
private double weight; //weight for the field (to be used in the aggregation)
private double threshold; //threshold for the field (to be used in case it is a sufficient or a necessary condition)
private double threshold; //threshold for the field (to be used in some kind of aggregations)
private double result; //the result of the comparison
private Field a;
private Field b;

View File

@ -6,7 +6,6 @@ import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;
@ -34,21 +33,30 @@ public class TreeNodeDef implements Serializable {
this.ignoreUndefined = ignoreUndefined;
}
public TreeNodeDef() {
}
public TreeNodeDef() {}
//function for the evaluation of the node
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) {
TreeNodeStats stats = new TreeNodeStats();
//for each field in the node, it computes the comparison result and records it in the node stats
for (FieldConf fieldConf : fields) {
double weight = fieldConf.getWeight();
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
stats.addFieldStats(fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), new FieldStats(weight, Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")), result, fieldConf.isCountIfUndefined(), doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField())));
stats.addFieldStats(
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
new FieldStats(
weight,
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "0.5")),
result,
fieldConf.isCountIfUndefined(),
doc1.getFieldMap().get(fieldConf.getField()),
doc2.getFieldMap().get(fieldConf.getField())
));
}
return stats;

View File

@ -2,6 +2,7 @@ package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.*;
import eu.dnetlib.pace.model.gt.Match;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

View File

@ -4,7 +4,6 @@ import com.google.common.collect.Lists;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.WfConfig;
//import eu.dnetlib.pace.distance.PaceDocumentDistance;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument;
@ -36,11 +35,9 @@ public class BlockProcessor {
this.dedupConf = dedupConf;
}
public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context) {
if (documents.size() > 1) {
// log.info("reducing key: '" + key + "' records: " + q.size());
//process(q, context);
process(prepare(documents), context);
} else {
@ -54,7 +51,6 @@ public class BlockProcessor {
if (q.size() > 1) {
// log.info("reducing key: '" + key + "' records: " + q.size());
//process(q, context);
process(simplifyQueue(q, key, context), context);
} else {
@ -128,8 +124,6 @@ public class BlockProcessor {
private void process(final Queue<MapDocument> queue, final Reporter context) {
// final PaceDocumentDistance algo = new PaceDocumentDistance();
while (!queue.isEmpty()) {
final MapDocument pivot = queue.remove();
@ -140,8 +134,6 @@ public class BlockProcessor {
final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? null : fieldsPivot.stringValue();
if (fieldPivot != null) {
// System.out.println(idPivot + " --> " + fieldPivot);
int i = 0;
for (final MapDocument curr : queue) {
final String idCurr = curr.getIdentifier();