diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 23ff7acd2..8aeb36a0f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -20,15 +20,17 @@ import java.util.*; import java.util.stream.Collectors; /** - * Set of common functions + * Set of common functions for the framework * * @author claudio * */ public abstract class AbstractPaceFunctions { + //city map to be used when translating the city names into codes private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); + //list of stopwords in different languages protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); protected static Set stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt"); protected static Set stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt"); @@ -36,15 +38,14 @@ public abstract class AbstractPaceFunctions { protected static Set stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt"); protected static Set stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt"); + //blacklist of ngrams: to avoid generic keys protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; - private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń"; - private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn"; - - private static final String special_from = "İə"; - private static final String special_to = "Ie"; + private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń"; + private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn"; + //doi prefix for normalization public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; protected final static FieldList EMPTY_FIELD = new FieldListImpl(); @@ -54,8 +55,7 @@ public abstract class AbstractPaceFunctions { } protected String cleanup(final String s) { - final String ss = fixSpecial(s); //TODO is there something implemented to replace strange symbols with latin letters? - final String s0 = ss.toLowerCase(); + final String s0 = s.toLowerCase(); final String s1 = fixAliases(s0); final String s2 = nfd(s1); final String s3 = s2.replaceAll("–", " "); @@ -63,15 +63,12 @@ public abstract class AbstractPaceFunctions { final String s5 = s4.replaceAll(""", " "); final String s6 = s5.replaceAll("−", " "); final String s7 = s6.replaceAll("([0-9]+)", " $1 "); - final String s8 = s7.replaceAll("[^\\p{ASCII}]|\\p{Punct}", " "); - final String s9 = s8.replaceAll("\\n", " "); - final String s10 = s9.replaceAll("(?m)\\s+", " "); - final String s11 = s10.trim(); - return s11; - } - - protected String finalCleanup(final String s) { - return s.toLowerCase(); + final String s8 = s7.replaceAll("[^\\p{ASCII}]", ""); + final String s9 = s8.replaceAll("[\\p{Punct}]", " "); + final String s10 = s9.replaceAll("\\n", " "); + final String s11 = s10.replaceAll("(?m)\\s+", " "); + final String s12 = s11.trim(); + return s12; } protected boolean checkNumbers(final String a, final String b) { @@ -98,16 +95,6 @@ public abstract class AbstractPaceFunctions { return s.replaceAll("\\D", ""); } - //sometimes the toLowerCase() produces error, this is meant to prevent them by replacing special character before the lowercase function - protected static String fixSpecial(final String s) { - final StringBuilder sb = new StringBuilder(); - for (final char ch : Lists.charactersOf(s)) { - final int i = StringUtils.indexOf(special_from, ch); - sb.append(i >= 0 ? special_to.charAt(i) : ch); - } - return sb.toString(); - } - protected static String fixAliases(final String s) { final StringBuilder sb = new StringBuilder(); for (final char ch : Lists.charactersOf(s)) { @@ -134,20 +121,19 @@ public abstract class AbstractPaceFunctions { return s != null; } - // /////////////////////// - protected String normalize(final String s) { - return nfd(s).toLowerCase() + return nfd(s) + .toLowerCase() // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings - .replaceAll("(\\W)+", " ") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("[^ \\w]+", "") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "") .replaceAll("(\\p{Punct})+", " ") .replaceAll("(\\d)+", " ") .replaceAll("(\\n)+", " ") .trim(); } - private String nfd(final String s) { + public String nfd(final String s) { return Normalizer.normalize(s, Normalizer.Form.NFD); } @@ -186,8 +172,6 @@ public abstract class AbstractPaceFunctions { return newset; } - // //////////////////// - public static Set loadFromClasspath(final String classpath) { final Set h = Sets.newHashSet(); try { @@ -217,17 +201,6 @@ public abstract class AbstractPaceFunctions { return m; } - //translate the string: replace the keywords with the code - public String translate(String s1, Map translationMap){ - final StringTokenizer st = new StringTokenizer(s1); - final StringBuilder sb = new StringBuilder(); - while (st.hasMoreTokens()){ - final String token = st.nextToken(); - sb.append(translationMap.getOrDefault(token,token) + " "); - } - return sb.toString().trim(); - } - public String removeKeywords(String s, Set keywords) { s = " " + s + " "; @@ -238,7 +211,6 @@ public abstract class AbstractPaceFunctions { return s.trim(); } - public double keywordsCompare(Set s1, Set s2, Map translationMap){ Set k1 = keywordsToCodes(s1, translationMap); @@ -252,23 +224,6 @@ public abstract class AbstractPaceFunctions { return (double)CollectionUtils.intersection(k1,k2).size()/(double)longer; } - //returns true if at least 1 city is in common - //returns true if no cities are contained in names - //returns false if one of the two names have no city - public boolean sameCity(Set s1, Set s2){ - - Set c1 = citiesToCodes(s1); - Set c2 = citiesToCodes(s2); - - if (c1.isEmpty() && c2.isEmpty()) - return true; - else { - if (c1.isEmpty() ^ c2.isEmpty()) - return false; - return CollectionUtils.intersection(c1, c2).size() > 0; - } - } - //convert the set of keywords to codes public Set toCodes(Set keywords, Map translationMap) { return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); @@ -294,7 +249,7 @@ public abstract class AbstractPaceFunctions { return pid.toLowerCase().replaceAll(DOI_PREFIX, ""); } - //get the list of codes into the input string + //get the list of keywords into the input string public Set getKeywords(String s1, Map translationMap, int windowSize){ String s = s1; @@ -311,10 +266,10 @@ public abstract class AbstractPaceFunctions { while (length != 0) { for (int i = 0; i<=tokens.size()-length; i++){ - String candidate = Joiner.on(" ").join(tokens.subList(i, i + length)); + String candidate = concat(tokens.subList(i, i + length)); if (translationMap.containsKey(candidate)) { codes.add(candidate); - s = s.replace(candidate, ""); + s = s.replace(candidate, "").trim(); } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index beab7a85f..bcfa5a108 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -45,6 +45,7 @@ public class DedupConfig implements Config, Serializable { defaults.put("slidingWindowSize", "200"); defaults.put("rootBuilder", "result"); defaults.put("includeChildren", "true"); + defaults.put("maxIterations", "20"); } public DedupConfig() {} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 1d4a03e37..7a87a82a5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -1,7 +1,7 @@ package eu.dnetlib.pace.config; import com.google.common.collect.Maps; - +import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.tree.support.TreeNodeDef; @@ -9,11 +9,10 @@ import eu.dnetlib.pace.util.PaceResolver; import org.codehaus.jackson.annotate.JsonIgnore; import java.io.Serializable; -import java.text.Normalizer; import java.util.List; import java.util.Map; -public class PaceConfig implements Serializable { +public class PaceConfig extends AbstractPaceFunctions implements Serializable { private List model; @@ -46,7 +45,7 @@ public class PaceConfig implements Serializable { for (String key : synonyms.keySet()) { for (String term : synonyms.get(key)){ translationMap.put( - Normalizer.normalize(term.toLowerCase(), Normalizer.Form.NFD), + normalize(term.toLowerCase()), key); } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java index ddcfaaece..bd00f2fb9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java @@ -76,6 +76,12 @@ public class WfConfig implements Serializable { /** Maximum number of allowed children. */ private int maxChildren = MAX_CHILDREN; + /** Default maximum number of iterations. */ + private final static int MAX_ITERATIONS = 20; + + /** Maximum number of iterations */ + private int maxIterations = MAX_ITERATIONS; + public WfConfig() {} /** @@ -104,7 +110,7 @@ public class WfConfig implements Serializable { */ public WfConfig(final String entityType, final String orderField, final List rootBuilder, final String dedupRun, final double threshold, - final Set skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren) { + final Set skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations) { super(); this.entityType = entityType; this.orderField = orderField; @@ -116,6 +122,7 @@ public class WfConfig implements Serializable { this.groupMaxSize = groupMaxSize; this.slidingWindowSize = slidingWindowSize; this.includeChildren = includeChildren; + this.maxIterations = maxIterations; } /** @@ -245,6 +252,15 @@ public class WfConfig implements Serializable { this.maxChildren = maxChildren; } + public int getMaxIterations() { + return maxIterations; + } + + public WfConfig setMaxIterations(int maxIterations) { + this.maxIterations = maxIterations; + return this; + } + /* * (non-Javadoc) * diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java index 4c167806d..6899be109 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java @@ -5,6 +5,7 @@ import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.config.Config; +import org.apache.commons.collections.CollectionUtils; import java.util.Map; import java.util.Set; @@ -44,7 +45,7 @@ public class JaroWinklerNormalizedName extends AbstractComparator { Set cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); Set cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); - if (sameCity(cities1,cities2)) { + if (checkCities(cities1,cities2)) { if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) { @@ -64,6 +65,22 @@ public class JaroWinklerNormalizedName extends AbstractComparator { return 0.0; } + //returns true if at least 1 city is in common + //returns true if no cities are contained in names + //returns false if one of the two names have no city + public boolean checkCities(Set s1, Set s2){ + Set c1 = citiesToCodes(s1); + Set c2 = citiesToCodes(s2); + + if (c1.isEmpty() && c2.isEmpty()) + return true; + else { + if (c1.isEmpty() ^ c2.isEmpty()) + return false; + return CollectionUtils.intersection(c1, c2).size() > 0; + } + } + @Override public double getWeight() { return super.weight; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java index 6a19e6686..0a9ffdf16 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java @@ -31,10 +31,7 @@ public class Level2JaroWinklerTitle extends AbstractComparator { if (check) return 0.5; - final String cca = finalCleanup(ca); - final String ccb = finalCleanup(cb); - - return ssalgo.score(cca, ccb); + return ssalgo.score(ca, cb); } @Override diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java index 546284edf..e5edacfee 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java @@ -36,10 +36,7 @@ public class LevensteinTitle extends AbstractComparator { if (check) return 0.5; - final String cca = finalCleanup(ca); - final String ccb = finalCleanup(cb); - - return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length()); + return normalize(ssalgo.score(ca, cb), ca.length(), cb.length()); } private double normalize(final double score, final int la, final int lb) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java index 83b80b1d8..89f3749b5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java @@ -37,10 +37,7 @@ public class LevensteinTitleIgnoreVersion extends AbstractComparator { ca = filterAllStopWords(ca); cb = filterAllStopWords(cb); - final String cca = finalCleanup(ca); - final String ccb = finalCleanup(cb); - - return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length()); + return normalize(ssalgo.score(ca, cb), ca.length(), cb.length()); } private double normalize(final double score, final int la, final int lb) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java index 8f7316ba5..697294a75 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java @@ -4,11 +4,15 @@ import eu.dnetlib.pace.util.PaceException; public enum AggType { - WEIGHTED_MEAN, - AVG, + W_MEAN, //weighted mean + AVG, //average SUM, MAX, - MIN; + MIN, + NC, //necessary condition + SC, //sufficient condition + AND, + OR; public static AggType getEnum(String value) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java index 8524996f4..67c26e99a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java @@ -6,7 +6,7 @@ import eu.dnetlib.pace.model.Field; public interface Comparator { /* - * return : -1 -> can't decide (missing field) + * return : -1 -> can't decide (i.e. missing field) * >0 -> similarity degree (depends on the algorithm) * */ public double compare(Field a, Field b, Config conf); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java index 3dd4b0185..b25d2a03c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java @@ -14,25 +14,25 @@ public class FieldConf implements Serializable { private double weight = 1.0; //weight for the field (to be used in the aggregation) private Map params; //parameters - private boolean ignoreMissing; + private boolean countIfUndefined; - public boolean isIgnoreMissing() { - return ignoreMissing; + public boolean isCountIfUndefined() { + return countIfUndefined; } - public void setIgnoreMissing(boolean ignoreMissing) { - this.ignoreMissing = ignoreMissing; + public void setCountIfUndefined(boolean countIfUndefined) { + this.countIfUndefined = countIfUndefined; } public FieldConf() { } - public FieldConf(String field, String comparator, double weight, Map params, boolean ignoreMissing) { + public FieldConf(String field, String comparator, double weight, Map params, boolean countIfUndefined) { this.field = field; this.comparator = comparator; this.weight = weight; this.params = params; - this.ignoreMissing = ignoreMissing; + this.countIfUndefined = countIfUndefined; } public String getField() { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index 1d187b056..4f95ad366 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -4,7 +4,6 @@ import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.PaceException; -import org.codehaus.jackson.annotate.JsonIgnore; import org.codehaus.jackson.map.ObjectMapper; import java.io.IOException; @@ -22,16 +21,16 @@ public class TreeNodeDef implements Serializable { private String negative; private String undefined; - boolean ignoreMissing; + boolean ignoreUndefined; - public TreeNodeDef(List fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreMissing) { + public TreeNodeDef(List fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreUndefined) { this.fields = fields; this.aggregation = aggregation; this.threshold = threshold; this.positive = positive; this.negative = negative; this.undefined = undefined; - this.ignoreMissing = ignoreMissing; + this.ignoreUndefined = ignoreUndefined; } public TreeNodeDef() { @@ -48,9 +47,9 @@ public class TreeNodeDef implements Serializable { double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); - if (result == -1) { //if the field is missing - stats.incrementMissCount(); - if (!fieldConf.isIgnoreMissing()) { + if (result == -1) { //if the comparison is undefined + stats.incrementUndefinedCount(); + if (fieldConf.isCountIfUndefined()) { //if it must be taken into account, increment weights (i.e. the average would be lower) stats.incrementWeightsSum(weight); } } @@ -117,12 +116,12 @@ public class TreeNodeDef implements Serializable { this.undefined = undefined; } - public boolean isIgnoreMissing() { - return ignoreMissing; + public boolean isIgnoreUndefined() { + return ignoreUndefined; } - public void setIgnoreMissing(boolean ignoreMissing) { - this.ignoreMissing = ignoreMissing; + public void setIgnoreUndefined(boolean ignoreUndefined) { + this.ignoreUndefined = ignoreUndefined; } @Override diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java index 1ae899637..8d313e8eb 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java @@ -7,7 +7,7 @@ import java.io.Serializable; public class TreeNodeStats implements Serializable { private DescriptiveStatistics stats; - private int missCount = 0; + private int undefinedCount = 0; //counter for the number of undefined comparisons between the fields in the tree node private int fieldsCount = 0; private double weightsSum = 0.0; @@ -15,8 +15,8 @@ public class TreeNodeStats implements Serializable { this.stats = new DescriptiveStatistics(); } - public TreeNodeStats(int missCount, int fieldsCount, double weightsSum) { - this.missCount = missCount; + public TreeNodeStats(int undefinedCount, int fieldsCount, double weightsSum) { + this.undefinedCount = undefinedCount; this.fieldsCount = fieldsCount; this.weightsSum = weightsSum; } @@ -29,12 +29,12 @@ public class TreeNodeStats implements Serializable { this.stats = stats; } - public int getMissCount() { - return missCount; + public int getUndefinedCount() { + return undefinedCount; } - public void setMissCount(int missCount) { - this.missCount = missCount; + public void setUndefinedCount(int undefinedCount) { + this.undefinedCount = undefinedCount; } public int getFieldsCount() { @@ -57,8 +57,8 @@ public class TreeNodeStats implements Serializable { this.weightsSum += delta; } - public void incrementMissCount(){ - this.missCount += 1; + public void incrementUndefinedCount(){ + this.undefinedCount += 1; } public void incrementScoresSum(double delta){ @@ -72,11 +72,15 @@ public class TreeNodeStats implements Serializable { return stats.getMean(); case SUM: return stats.getSum(); + case SC: + case OR: case MAX: return stats.getMax(); + case NC: + case AND: case MIN: return stats.getMin(); - case WEIGHTED_MEAN: + case W_MEAN: return stats.getSum()/weightsSum; default: return 0.0; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java index 70e96235a..50d3ec667 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java @@ -40,9 +40,11 @@ public class TreeProcessor { TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); - if (!currentNode.isIgnoreMissing() && stats.getMissCount()>0) { + //if ignoreUndefined=false the miss is considered as undefined + if (!currentNode.isIgnoreUndefined() && stats.getUndefinedCount()>0) { current = currentNode.getUndefined(); } + //if ignoreUndefined=true the miss is ignored and the score computed anyway else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) { current = currentNode.getPositive(); } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java index 3da6f0a7e..14e1e8d0d 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace; +import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldListImpl; @@ -11,7 +12,7 @@ import java.io.StringWriter; import java.util.List; import java.util.stream.Collectors; -public abstract class AbstractPaceTest { +public abstract class AbstractPaceTest extends AbstractPaceFunctions { protected String readFromClasspath(final String filename) { final StringWriter sw = new StringWriter(); diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 6eec8cbaa..ff7a49fbe 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -18,7 +18,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Before public void setUp() throws Exception { params = Maps.newHashMap(); - conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", ClusteringFunctionTest.class)); + conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ClusteringFunctionTest.class)); } @Test diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java index e15b54331..d92d4009a 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.common; +import org.junit.Assert; import org.junit.Test; import static junit.framework.Assert.assertEquals; @@ -7,6 +8,8 @@ import static junit.framework.Assert.assertTrue; public class PaceFunctionTest extends AbstractPaceFunctions { + private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; + @Test public void normalizePidTest(){ @@ -14,7 +17,6 @@ public class PaceFunctionTest extends AbstractPaceFunctions { assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347")); assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI")); assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI")); - } @Test @@ -22,4 +24,35 @@ public class PaceFunctionTest extends AbstractPaceFunctions { assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche")); } + + @Test + public void normalizeTest() { + Assert.assertEquals("universitat", normalize("Universität")); + + System.out.println(normalize("İstanbul Ticarət Universiteti")); + } + + @Test + public void cleanupTest() { + assertEquals("istanbul ticaret universiteti", cleanup("İstanbul Ticarət Universiteti")); + + + System.out.println("cleaned up : " + cleanup(TEST_STRING)); + } + + @Test + public void testGetNumbers() { + System.out.println("Numbers : " + getNumbers(TEST_STRING)); + } + + @Test + public void testRemoveSymbols() { + System.out.println("Without symbols: " + removeSymbols(TEST_STRING)); + } + + @Test + public void testFixAliases() { + System.out.println("Fixed aliases : " + fixAliases(TEST_STRING)); + } + } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java similarity index 87% rename from dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java rename to dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index c3dc48282..4247d9be6 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -14,9 +14,8 @@ import java.util.Map; import static junit.framework.Assert.assertEquals; import static junit.framework.Assert.assertTrue; -public class DistanceAlgoTest extends AbstractPaceFunctions { +public class ComparatorTest extends AbstractPaceFunctions { - private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; private Map params; private DedupConfig conf; @@ -24,7 +23,8 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void setup() { params = new HashMap<>(); params.put("weight", 1.0); - conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", DistanceAlgoTest.class)); + conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ComparatorTest.class)); + } @Test @@ -33,26 +33,6 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa")); } - @Test - public void testGetNumbers() { - System.out.println("Numbers : " + getNumbers(TEST_STRING)); - } - - @Test - public void testRemoveSymbols() { - System.out.println("Without symbols: " + removeSymbols(TEST_STRING)); - } - - @Test - public void testFixAliases() { - System.out.println("Fixed aliases : " + fixAliases(TEST_STRING)); - } - - @Test - public void testCleanup() { - System.out.println("cleaned up : " + cleanup(TEST_STRING)); - } - @Test public void testJaroWinklerNormalizedName() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 9051049fb..33ef542da 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -3,6 +3,8 @@ package eu.dnetlib.pace.config; import eu.dnetlib.pace.AbstractPaceTest; import org.junit.Test; +import java.util.Map; + import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; @@ -11,15 +13,10 @@ public class ConfigTest extends AbstractPaceTest { @Test public void dedupConfigSerializationTest() { - final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("org.curr.conf")); + final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf")); final String conf = cfgFromClasspath.toString(); -// System.out.println("*****SERIALIZED*****"); -// System.out.println(conf); -// System.out.println("*****FROM CLASSPATH*****"); -// System.out.println(readFromClasspath("result.pace.conf.json")); - final DedupConfig cfgFromSerialization = DedupConfig.load(conf); assertEquals(cfgFromClasspath.toString(), cfgFromSerialization.toString()); @@ -27,29 +24,36 @@ public class ConfigTest extends AbstractPaceTest { assertNotNull(cfgFromClasspath); assertNotNull(cfgFromSerialization); - } @Test public void dedupConfigTest() { - DedupConfig load = DedupConfig.load(readFromClasspath("org.curr.conf")); + DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf")); System.out.println(load.toString()); } @Test - public void translationMapTest() { + public void initTranslationMapTest() { - DedupConfig load = DedupConfig.load(readFromClasspath("org.curr.conf")); + DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf")); + + Map translationMap = load.translationMap(); + + System.out.println("translationMap = " + translationMap.size()); + + for (String key: translationMap.keySet()) { + if (translationMap.get(key).equals("key::1")) + System.out.println("key = " + key); + } - System.out.println("translationMap = " + load.getPace().translationMap().toString()); } @Test public void emptyTranslationMapTest() { - DedupConfig load = DedupConfig.load(readFromClasspath("org.test.conf")); + DedupConfig load = DedupConfig.load(readFromClasspath("organization.no_synonyms.conf")); assertEquals(0, load.getPace().translationMap().keySet().size()); } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf deleted file mode 100644 index 3af658574..000000000 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.test.conf +++ /dev/null @@ -1,40 +0,0 @@ -{ - "wf" : { - "threshold" : "0.9", - "dedupRun" : "001", - "entityType" : "organization", - "orderField" : "legalname", - "queueMaxSize" : "2000", - "groupMaxSize" : "50", - "slidingWindowSize" : "200", - "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], - "includeChildren" : "true" - }, - "pace" : { - "clustering" : [ - { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, - { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, - { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } - ], - "strictConditions" : [ - { "name" : "exactMatch", "fields" : [ "gridid" ] } - ], - "conditions" : [ - { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }, - { "name" : "exactMatch", "fields" : [ "country" ] } - ], - "model" : [ - { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" }, - { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, - { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, - { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, - { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } - ], - "blacklists" : { - "legalname" : [] - }, - "synonyms": { - } - } -} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf similarity index 85% rename from dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf rename to dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf index 017cbab93..510de0398 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf @@ -12,42 +12,28 @@ }, "pace" : { "clustering" : [ -<<<<<<< HEAD - { "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, - { "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } } -======= { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } ->>>>>>> origin/master - ], - "sufficientConditions" : [ - { "name" : "exactMatch", "fieldsCount" : [ "gridid" ] } - ], -<<<<<<< HEAD - "necessaryConditions" : [ - { "name" : "exactMatch", "fieldsCount" : [ "country" ] }, - { "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] } -======= - "conditions" : [ - { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }, - { "name" : "exactMatch", "fields" : [ "country" ] } ->>>>>>> origin/master ], + "decisionTree" : { + "start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"}, + "layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"}, + "layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"} + }, "model" : [ - { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" }, - { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, - { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, - { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, - { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } + { "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"}, + { "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"}, + { "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" }, + { "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" }, + { "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"} ], "blacklists" : { "legalname" : [] }, "synonyms": { - "key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"], + "key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"], "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], @@ -97,58 +83,58 @@ "key::48": ["promotion","promozione","продвижение","proothisis","forderung"], "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"], "key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"], - "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"], - "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"], - "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"], - "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"], - "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"], - "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"], - "key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"], - "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"], - "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"], - "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"], - "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"], - "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"], - "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"], - "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"], - "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"], - "key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"], - "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"], - "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"], - "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"], - "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"], - "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"], - "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"], - "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"], - "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"], - "key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline"], - "key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"], - "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"], - "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"], - "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"], - "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"], - "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"], - "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"], - "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"], - "key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"], - "key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"], - "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"], - "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"], - "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"], - "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"], - "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"], - "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"], - "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"], - "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"], - "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"], - "key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"], - "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"], - "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"], - "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"], - "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"], - "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"], - "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"], - "key::102": ["informatics","informatica","informática","informática","informatica"], + "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""], + "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""], + "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""], + "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""], + "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""], + "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""], + "key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""], + "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""], + "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""], + "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""], + "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""], + "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""], + "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""], + "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""], + "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""], + "key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""], + "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""], + "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""], + "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""], + "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""], + "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""], + "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""], + "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""], + "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""], + "key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""], + "key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""], + "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""], + "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""], + "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""], + "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""], + "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""], + "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""], + "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""], + "key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""], + "key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""], + "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""], + "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""], + "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""], + "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""], + "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""], + "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""], + "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""], + "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""], + "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""], + "key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""], + "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""], + "key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""], + "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""], + "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""], + "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""], + "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""], + "key::102": ["informatics","informatica","informática","informática","informatica",""], "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"] } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.no_synonyms.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.no_synonyms.conf new file mode 100644 index 000000000..d79b7758e --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.no_synonyms.conf @@ -0,0 +1,38 @@ +{ + "wf" : { + "threshold" : "0.9", + "dedupRun" : "001", + "entityType" : "organization", + "orderField" : "legalname", + "queueMaxSize" : "2000", + "groupMaxSize" : "50", + "slidingWindowSize" : "200", + "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, + { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } + ], + "decisionTree" : { + "start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"}, + "layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"}, + "layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"} + }, + "model" : [ + { "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"}, + { "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"}, + { "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" }, + { "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" }, + { "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"} + ], + "blacklists" : { + "legalname" : [] + }, + "synonyms": { + } + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json deleted file mode 100644 index 786424a34..000000000 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "wf" : { - "threshold" : "0.99", - "dedupRun" : "001", - "entityType" : "result", - "orderField" : "title", - "queueMaxSize" : "2000", - "groupMaxSize" : "10", - "slidingWindowSize" : "200", - "rootBuilder" : [ "result" ], - "includeChildren" : "true" - }, - "pace" : { - "clustering" : [ - { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } - ], - "strictConditions" : [ - { "name" : "exactMatch", "fields" : [ "pid" ] } - ], - "conditions" : [ - { "name" : "yearMatch", "fields" : [ "dateofacceptance" ] }, - { "name" : "titleVersionMatch", "fields" : [ "title" ] }, - { "name" : "sizeMatch", "fields" : [ "authors" ] } - ], - "model" : [ - { "name" : "pid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" }, - { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, - { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } , - { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" } - ], - "blacklists" : { - "title" : [ - "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", - "^(Kiri Karl Morgensternile).*$", - "^(\\[Eksliibris Aleksandr).*\\]$", - "^(\\[Eksliibris Aleksandr).*$", - "^(Eksliibris Aleksandr).*$", - "^(Kiri A\\. de Vignolles).*$", - "^(2 kirja Karl Morgensternile).*$", - "^(Pirita kloostri idaosa arheoloogilised).*$", - "^(Kiri tundmatule).*$", - "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", - "^(Eksliibris Nikolai Birukovile).*$", - "^(Eksliibris Nikolai Issakovile).*$", - "^(WHP Cruise Summary Information of section).*$", - "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", - "^(Measurement of the spin\\-dependent structure function).*" - ] } - } - -} diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.test.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.test.conf new file mode 100644 index 000000000..115bb04f0 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.test.conf @@ -0,0 +1,51 @@ +{ + "wf" : { + "threshold" : "0.99", + "dedupRun" : "001", + "entityType" : "result", + "orderField" : "title", + "queueMaxSize" : "2000", + "groupMaxSize" : "10", + "slidingWindowSize" : "200", + "rootBuilder" : [ "result" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } + ], + "decisionTree": { + "start": {"fields": [{"field":"pid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"}, + "layer2": {"fields": [{"field":"dateofacceptance", "comparator":"yearMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"title", "comparator":"titleVersionMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}, {"field":"authors", "comparator":"sizeMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"}, + "layer3": {"fields": [{"field":"title", "comparator":"JaroWinkler", "weight":1.0, "countIfUndefined":"false", "params":{}}], "threshold": 0.99, "aggregation": "MAX", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "false"} + }, + "model" : [ + { "name" : "pid", "type" : "String", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" }, + { "name" : "title", "type" : "String", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, + { "name" : "dateofacceptance", "type" : "String", "path" : "result/metadata/dateofacceptance/value" } , + { "name" : "authors", "type" : "List", "path" : "result/author/metadata/fullname/value" } + ], + "blacklists" : { + "title" : [ + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*" + ] }, + "synonyms": {} + } + +}